diff --git a/.gitignore b/.gitignore index b6e47617..a0745fe1 100644 --- a/.gitignore +++ b/.gitignore @@ -104,7 +104,7 @@ celerybeat.pid # Environments .env .venv -env/ +#env/ venv/ ENV/ env.bak/ @@ -127,3 +127,14 @@ dmypy.json # Pyre type checker .pyre/ + +# segmentation +./segmentation/output +./segmentation/log +*.pdparams +*.pdopt +./segmentation/pytorch_2_paddle.py +./segmentation/readme.txt +setr.py.bak + + diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..359f5483 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,118 @@ +# Contribute Code + +You encourage and appreciate researchers and developers to contribute to project **PPViT**. + +This document explains our workflow and working style. + +## Workflow + +PPViT uses this [Git branching model](http://nvie.com/posts/a-successful-git-branching-model/). You can follow the listed steps for common contributions. + +### 1. Fork the repo + + Please **file `Pull Requests` from your fork**. + + To make a fork, just head over to our GitHub repo page and click the ["Fork"](https://help.github.com/articles/fork-a-repo/) button. + +### 2. Clone the repo + + To make a copy of your fork to your local env: + + ```bash + $ git clone https://github.com/your-github-account/PPViT + $ cd PPViT + ``` + +### 3. Create local `feature` branch + + For daily works like adding a new feature or fixing a bug, open a `feature` branch based on `develop` branch before coding: + + ```bash + $ git checkout develop + $ git checkout -b feature + ``` + wher `feature` can be replaced with the name of your feature you are working on. + +### 4. Commit + + Commit your code to the local repository **during and after** your coding. + + ```shell + $ git add -A + $ git commit -m “message” + ``` + +### 5. Test + + - We encourage writing `unittest` to test your class and method. + - Please test and report model performance on related datasets before you start to merge. + +### 6. Keep pulling + + An experienced Git user pulls from the official repo often -- daily or even hourly, so they notice conflicts with others work early, and it's easier to resolve smaller conflicts. + + ```bash + $ git remote add upstream https://github.com/xperzy/PPViT + $ git pull upstream develop + ``` + +### 7. Push and file a `Pull Request` + + 1. **Push** your local work into your forked repo: + + ```bash + $ git push origin my-cool-stuff + ``` + + The push allows you to create a pull request, requesting owners of this [official repo](https://github.com/BR-IDL/PaddleViT) to pull your change into the official one. + + 2. To create a `Pull Request`, please follow [these steps](https://help.github.com/articles/creating-a-pull-request/). + + If your change is for fixing an issue, please write ["Fixes "](https://help.github.com/articles/closing-issues-using-keywords/) in the description section of your pull request. Github would close the issue when the owners merge your pull request. + + Please remember to specify some reviewers for your pull request. If you don't know who are the right ones, please follow Github's recommendation. + +### 8. Delete local and remote `feature` branches + + After merging into `develop` branch successfully, delete your `feature` branch. 
+ To keep your local workspace and your fork clean, you might want to remove merged branches: + + ```bash + $ git push origin :my-cool-stuff + $ git checkout develop + $ git pull upstream develop + $ git branch -d my-cool-stuff + ``` + +## Code Review + +- Please feel free to ping your reviewers by sending them the URL of your pull request via IM or email. + +- Please answer reviewers' every comment. If you are to follow the comment, please write "Done"; please give a reason otherwise. + +- If you don't want your reviewers to get overwhelmed by email notifications, you might reply their comments by [in a batch](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/). + +- Reduce the unnecessary commits. Some developers commit often. It is recommended to append a sequence of small changes into one commit by running `git commit --amend` instead of `git commit`. + +## Coding Standard + +### Code Style + +Our Python code follows the [PEP8 language guide](https://zh-google-styleguide.readthedocs.io/en/latest/google-python-styleguide/python_language_rules/) and [PEP8 style guide](https://zh-google-styleguide.readthedocs.io/en/latest/google-python-styleguide/python_style_rules/). + +### Use Pylint + +[Pylint](http://pylint.pycqa.org/en/latest/) is a Python code analysis tool that analyzes errors in Python code and finds code that does not meet coding style standards and has potential problems. + +### Comments and Annotations + +To make it easier for others to use and generate online documents, please include a docstring for each function on each class method. + +### Unit Tests + +Please remember to add related unit tests. + +- For Python code, please use [Python's standard `unittest` package](http://pythontesting.net/framework/unittest/unittest-introduction/). + +Try to have unit tests for each function on each class method. + diff --git a/PaddleViT.png b/PaddleViT.png new file mode 100644 index 00000000..99c80984 Binary files /dev/null and b/PaddleViT.png differ diff --git a/README.md b/README.md index b42b9ad4..1300a6a3 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,392 @@ -# PPViT -Implementation of SOTA visual transformers and mlp models on PaddlePaddle 2.0+ +# PaddlePaddle Vision Transformers # + +[![GitHub](https://img.shields.io/github/license/BR-IDL/PaddleViT?color=blue)](./LICENSE) +[![GitHub Repo stars](https://img.shields.io/github/stars/BR-IDL/PaddleViT?style=social)](https://github.com/BR-IDL/PaddleViT/stargazers) + + +
+<p align="center">
+    <img src="./PaddleViT.png" width="100%"/>
+</p>
+ +## State-of-the-art Visual Transformer and MLP Models for PaddlePaddle ## + +:robot: PaddlePaddle Visual Transformers (`PaddleViT` or `PPViT`) is a collection of vision models beyond convolution. Most of the models are based on Visual Transformers, Visual Attentions, and MLPs, etc. PaddleViT also integrates popular layers, utilities, optimizers, schedulers, data augmentations, training/validation scripts for PaddlePaddle 2.1+. The aim is to reproduce a wide variety of state-of-the-art ViT and MLP models with full training/validation procedures. We are passionate about making cuting-edge CV techniques easier to use for everyone. + +:robot: PaddleViT provides models and tools for multiple vision tasks, such as classifications, object detection, semantic segmentation, GAN, and more. Each model architecture is defined in standalone python module and can be modified to enable quick research experiments. At the same time, pretrained weights can be downloaded and used to finetune on your own datasets. PaddleViT also integrates popular tools and modules for custimized dataset, data preprocessing, performance metrics, DDP and more. + +:robot: PaddleViT is backed by popular deep learning framework [PaddlePaddle](https://www.paddlepaddle.org/), we also provide tutorials and projects on [Paddle AI Studio](https://aistudio.baidu.com/aistudio/index). It's intuitive and straightforward to get started for new users. + + +## Quick Links ## +PaddleViT implements model architectures and tools for multiple vision tasks, go to the following links for detailed information. +- [PaddleViT-Cls](./image_classification) for Image Classification +- [PaddleViT-Det](./object_detection/DETR) for object detection +- [PaddleViT-Seg](./semantic_segmentation) for Semantic Segmentation +- [PaddleViT-GAN](./gan) for GANs. + +We also provide tutorials: +- Notebooks (coming soon) +- Online Course (coming soon) + + +## Model architectures ## + +### Image Classification (Transformers) ### +1. **[ViT](./image_classification/ViT)** (from Google), released with paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929), by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +2. **[DeiT](./image_classification/DeiT)** (from Facebook and Sorbonne), released with paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877), by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +3. **[Swin Transformer](./image_classification/SwinTransformer)** (from Microsoft), released with paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030), by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. +4. **[VOLO](./image_classification/VOLO)** (from Sea AI Lab and NUS), released with paper [VOLO: Vision Outlooker for Visual Recognition](https://arxiv.org/abs/2106.13112), by Li Yuan, Qibin Hou, Zihang Jiang, Jiashi Feng, Shuicheng Yan. +5. 
**[CSwin Transformer](./image_classification/CSwin)** (from USTC and Microsoft), released with paper [CSWin Transformer: A General Vision Transformer Backbone with Cross-Shaped Windows +](https://arxiv.org/abs/2107.00652), by Xiaoyi Dong, Jianmin Bao, Dongdong Chen, Weiming Zhang, Nenghai Yu, Lu Yuan, Dong Chen, Baining Guo. +6. **[CaiT](./image_classification/CaiT)** (from Facebook and Sorbonne), released with paper [Going deeper with Image Transformers](https://arxiv.org/abs/2103.17239), by Hugo Touvron, Matthieu Cord, Alexandre Sablayrolles, Gabriel Synnaeve, Hervé Jégou. +7. **[PVTv2](./image_classification/PVTv2)** (from NJU/HKU/NJUST/IIAI/SenseTime), released with paper [PVTv2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797), by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. +8. **[Shuffle Transformer](./image_classification/Shuffle_Transformer)** (from Tencent), released with paper [Shuffle Transformer: Rethinking Spatial Shuffle for Vision Transformer](https://arxiv.org/abs/2106.03650), by Zilong Huang, Youcheng Ben, Guozhong Luo, Pei Cheng, Gang Yu, Bin Fu. +9. **[T2T-ViT](./image_classification/T2T_ViT)** (from NUS and YITU), released with paper [Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet +](https://arxiv.org/abs/2101.11986), by Li Yuan, Yunpeng Chen, Tao Wang, Weihao Yu, Yujun Shi, Zihang Jiang, Francis EH Tay, Jiashi Feng, Shuicheng Yan. + +#### Coming Soon: #### +1. **[CrossViT]()** (from IBM), released with paper [CrossViT: Cross-Attention Multi-Scale Vision Transformer for Image Classification](https://arxiv.org/abs/2103.14899), by Chun-Fu Chen, Quanfu Fan, Rameswar Panda. +2. **[Focal Transformer]()** (from Microsoft), released with paper [Focal Self-attention for Local-Global Interactions in Vision Transformers](https://arxiv.org/abs/2107.00641), by Jianwei Yang, Chunyuan Li, Pengchuan Zhang, Xiyang Dai, Bin Xiao, Lu Yuan and Jianfeng Gao. +3. **[HaloNet]()**, (from Google), released with paper [Scaling Local Self-Attention for Parameter Efficient Visual Backbones](https://arxiv.org/abs/2103.12731), by Ashish Vaswani, Prajit Ramachandran, Aravind Srinivas, Niki Parmar, Blake Hechtman, Jonathon Shlens. + + +### Image Classification (MLPs) ### +1. **[MLP-Mixer](./image_classification/MLP-Mixer)** (from Google), released with paper [MLP-Mixer: An all-MLP Architecture for Vision](https://arxiv.org/abs/2105.01601), by Ilya Tolstikhin, Neil Houlsby, Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Thomas Unterthiner, Jessica Yung, Andreas Steiner, Daniel Keysers, Jakob Uszkoreit, Mario Lucic, Alexey Dosovitskiy +2. **[ResMLP](./image_classification/ResMLP)** (from Facebook/Sorbonne/Inria/Valeo), released with paper [ResMLP: Feedforward networks for image classification with data-efficient training](https://arxiv.org/abs/2105.03404), by Hugo Touvron, Piotr Bojanowski, Mathilde Caron, Matthieu Cord, Alaaeldin El-Nouby, Edouard Grave, Gautier Izacard, Armand Joulin, Gabriel Synnaeve, Jakob Verbeek, Hervé Jégou. +3. **[gMLP](./image_classification/gMLP)** (from Google), released with paper [Pay Attention to MLPs](https://arxiv.org/abs/2105.08050), by Hanxiao Liu, Zihang Dai, David R. So, Quoc V. Le. + + + + + + +### Detection ### +1. 
**[DETR](./object_detection/DETR)** (from Facebook), released with paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872), by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. + +#### Coming Soon: #### +1. **[Swin Transformer]()** (from Microsoft), released with paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030), by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. +2. **[PVTv2]()** (from NJU/HKU/NJUST/IIAI/SenseTime), released with paper [PVTv2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797), by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. +3. **[Focal Transformer]()** (from Microsoft), released with paper [Focal Self-attention for Local-Global Interactions in Vision Transformers](https://arxiv.org/abs/2107.00641), by Jianwei Yang, Chunyuan Li, Pengchuan Zhang, Xiyang Dai, Bin Xiao, Lu Yuan and Jianfeng Gao. +4. **[UP-DETR]()** (from Tencent), released with paper [UP-DETR: Unsupervised Pre-training for Object Detection with Transformers](https://arxiv.org/abs/2011.09094), by Zhigang Dai, Bolun Cai, Yugeng Lin, Junying Chen. + + + + +### Semantic Segmentation ### +#### Now: #### +1. **[SETR](./semantic_segmentation)** (from Fudan/Oxford/Surrey/Tencent/Facebook), released with paper [Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformers](https://arxiv.org/abs/2012.15840), by Sixiao Zheng, Jiachen Lu, Hengshuang Zhao, Xiatian Zhu, Zekun Luo, Yabiao Wang, Yanwei Fu, Jianfeng Feng, Tao Xiang, Philip H.S. Torr, Li Zhang. +2. **[DPT](./semantic_segmentation)** (from Intel), released with paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413), by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +3. **[Swin Transformer](./semantic_segmentation)** (from Microsoft), released with paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030), by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. +2. **[Segmenter](./semantic_segmentation)** (from Inria), realeased with paper [Segmenter: Transformer for Semantic Segmentation](https://arxiv.org/pdf/2105.05633.pdf), by Robin Strudel, Ricardo Garcia, Ivan Laptev, Cordelia Schmid. +3. **[Trans2seg](./semantic_segmentation)** (from HKU/Sensetime/NJU), released with paper [Segmenting Transparent Object in the Wild with Transformer](https://arxiv.org/pdf/2101.08461.pdf), by Enze Xie, Wenjia Wang, Wenhai Wang, Peize Sun, Hang Xu, Ding Liang, Ping Luo. +4. **[SegFormer](./semantic_segmentation)** (from HKU/NJU/NVIDIA/Caltech), released with paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203), by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. + +#### Coming Soon: #### +1. **[FTN]()** (from Baidu), released with paper [Fully Transformer Networks for Semantic Image Segmentation](https://arxiv.org/pdf/2106.04108.pdf), by Sitong Wu, Tianyi Wu, Fangjian Lin, Shengwei Tian, Guodong Guo. +2. 
**[Shuffle Transformer]()** (from Tencent), released with paper [Shuffle Transformer: Rethinking Spatial Shuffle for Vision Transformer](https://arxiv.org/abs/2106.03650), by Zilong Huang, Youcheng Ben, Guozhong Luo, Pei Cheng, Gang Yu, Bin Fu +3. **[Focal Transformer]()** (from Microsoft), released with paper [Focal Self-attention for Local-Global Interactions in Vision Transformers](https://arxiv.org/abs/2107.00641), by Jianwei Yang, Chunyuan Li, Pengchuan Zhang, Xiyang Dai, Bin Xiao, Lu Yuan and Jianfeng Gao. +4. **[CSwin Transformer]()** (from USTC and Microsoft), released with paper [CSWin Transformer: A General Vision Transformer Backbone with Cross-Shaped Windows +](https://arxiv.org/abs/2107.00652), by Xiaoyi Dong, Jianmin Bao, Dongdong Chen, Weiming Zhang, Nenghai Yu, Lu Yuan, Dong Chen, Baining Guo. + + +### GAN ### +1. **[TransGAN](./gan/transGAN)** (from Seoul National University and NUUA), released with paper [TransGAN: Two Pure Transformers Can Make One Strong GAN, and That Can Scale Up](https://arxiv.org/abs/2102.07074), by Yifan Jiang, Shiyu Chang, Zhangyang Wang. +2. **[Styleformer](./gan/Styleformer)** (from Facebook and Sorbonne), released with paper [Styleformer: Transformer based Generative Adversarial Networks with Style Vector](https://arxiv.org/abs/2106.07023), by Jeeseung Park, Younggeun Kim. +#### Coming Soon: #### +1. **[ViTGAN]()** (from UCSD/Google), released with paper [ViTGAN: Training GANs with Vision Transformers](https://arxiv.org/pdf/2107.04589), by Kwonjoon Lee, Huiwen Chang, Lu Jiang, Han Zhang, Zhuowen Tu, Ce Liu. + + + +## Installation +### Prerequistites +* Linux/MacOS/Windows +* Python 3.6/3.7 +* PaddlePaddle 2.1.0+ +* CUDA10.2+ +### Installation +1. Create a conda virtual environment and activate it. + ```shell + conda create -n paddlevit python=3.7 -y + conda activate paddlevit + ``` +2. Install PaddlePaddle following the official instructions, e.g., + ```shell + conda install paddlepaddle-gpu==2.1.2 cudatoolkit=10.2 --channel https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/Paddle/ + ``` + > Note: please change the paddlepaddle version and cuda version accordingly to your environment. + +3. Install dependency packages + * General dependencies: + ``` + pip install yacs, yaml + ``` + * Packages for Segmentation: + ``` + pip install cityscapesScripts, detail + ``` + * Packages for GAN: + ``` + pip install lmdb + ``` +4. 
Clone project from GitHub + ``` + git clone https://github.com/BR-IDL/PaddleViT.git + ``` + + + +### Docker Install ### +(coming soon) + + + + +## Results (Ported Weights) ## +### Image Classification ### +| Model | Acc@1 | Acc@5 | Image Size | Crop_pct | Interpolation | Link | +|--------------------------------|-------|-------|------------|----------|---------------|--------------| +| vit_base_patch16_224 | 84.58 | 97.30 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/13D9FqU4ISsGxWXURgKW9eLOBV-pYPr-L/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1ms3o2fHMQpIoVqnEHitRtA)(qv4n) | +| vit_base_patch16_384 | 85.99 | 98.00 | 384 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1kWKaAgneDx0QsECxtf7EnUdUZej6vSFT/view?usp=sharing)/[baidu](https://pan.baidu.com/s/15ggLdiL98RPcz__SXorrXA)(wsum) | +| vit_large_patch16_224 | 85.81 | 97.82 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1jgwtmtp_cDWEhZE-FuWhs7lCdpqhAMft/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1HRxUJAwEiKgrWnJSjHyU0A)(1bgk) | +| swin_base_patch4_window7_224 | 85.27 | 97.56 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1yjZFJoJeDFIfsxh9x10XGqCb8s2-Gtbp/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1AseY3CKmJvlxoSwXnxHEwA)(wyck) | +| swin_base_patch4_window12_384 | 86.43 | 98.07 | 384 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1ThmGsTDZ8217-Zuo9o5EGLfzw8AI6N0w/view?usp=sharing)/[baidu](https://pan.baidu.com/s/10E3F9jqBeBTcasIvJ8iMzg)(4a95) | +| swin_large_patch4_window12_384 | 87.14 | 98.23 | 384 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1f30Mt80g5yLfEiViT4-kMLpyDjTUTV5B/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1w5w8QNfg0zY3nSfGs-Tx3A)(j71u) | +| pvtv2_b0 | 70.47 | 90.16 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1wkx4un6y7V87Rp_ZlD4_pV63QRst-1AE/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1mab4dOtBB-HsdzFJYrvgjA)(dxgb) | +| pvtv2_b1 | 78.70 | 94.49 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/11hqLxL2MTSnKPb-gp2eMZLAzT6q2UsmG/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1Ur0s4SEOxVqggmgq6AM-sQ)(2e5m) | +| pvtv2_b2 | 82.02 | 95.99 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1-KY6NbS3Y3gCaPaUam0v_Xlk1fT-N1Mz/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1FWx0QB7_8_ikrPIOlL7ung)(are2) | +| pvtv2_b3 | 83.14 | 96.47 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/16yYV8x7aKssGYmdE-YP99GMg4NKGR5j1/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1ge0rBsCqIcpIjrVxsrFhnw)(nc21) | +| pvtv2_b4 | 83.61 | 96.69 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1gvPdvDeq0VchOUuriTnnGUKh0N2lj-fA/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1VMSD_Kr_hduCZ5dxmDbLoA)(tthf) | +| pvtv2_b5 | 83.77 | 96.61 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1OHaHiHN_AjsGYBN2gxFcQCDhBbTvZ02g/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1ey4agxI2Nb0F6iaaX3zAbA)(9v6n) | +| pvtv2_b2_linear | 82.06 | 96.04 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1hC8wE_XanMPi0_y9apEBKzNc4acZW5Uy/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1IAhiiaJPe-Lg1Qjxp2p30w)(a4c8) | +| mlp_mixer_b16_224 | 76.60 | 92.23 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1ZcQEH92sEPvYuDc6eYZgssK5UjYomzUD/view?usp=sharing)/[baidu](https://pan.baidu.com/s/12nZaWGMOXwrCMOIBfUuUMA)(xh8x) | +| mlp_mixer_l16_224 | 72.06 | 87.67 | 224 | 0.875 | bicubic | 
[google](https://drive.google.com/file/d/1mkmvqo5K7JuvqGm92a-AdycXIcsv1rdg/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1AmSVpwCaGR9Vjsj_boL7GA)(8q7r) | +| resmlp_24_224 | 79.38 | 94.55 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/15A5q1XSXBz-y1AcXhy_XaDymLLj2s2Tn/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1nLAvyG53REdwYNCLmp4yBA)(jdcx) | +| resmlp_36_224 | 79.77 | 94.89 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1WrhVm-7EKnLmPU18Xm0C7uIqrg-RwqZL/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1QD4EWmM9b2u1r8LsnV6rUA)(33w3) | +| resmlp_big_24_224 | 81.04 | 95.02 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1KLlFuzYb17tC5Mmue3dfyr2L_q4xHTZi/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1oXU6CR0z7O0XNwu_UdZv_w)(r9kb) | +| resmlp_big_24_distilled_224 | 83.59 | 96.65 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/199q0MN_BlQh9-HbB28RdxHj1ApMTHow-/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1yUrfbqW8vLODDiRV5WWkhQ)(4jk5) | +| gmlp_s16_224 | 79.64 | 94.63 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1TLypFly7aW0oXzEHfeDSz2Va4RHPRqe5/view?usp=sharing)/[baidu](https://pan.baidu.com/s/13UUz1eGIKyqyhtwedKLUMA)(bcth) | +| volo_d5_224_86.10 | 86.08 | 97.58 | 224 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1GBOBPCBJYZfWybK-Xp0Otn0N4NXpct0G/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1t9gPLRAOkdXaG55fVADQZg)(td49) | +| volo_d5_512_87.07 | 87.05 | 97.97 | 512 | 1.15 | bicubic | [google](https://drive.google.com/file/d/1Phf_wHsjRZ1QrZ8oFrqsYuhDr4TXrVkc/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1X-WjpNqvWva2M977jgHosg)(irik) | +| cait_xxs24_224 | 78.38 | 94.32 | 224 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1LKsQUr824oY4E42QeUEaFt41I8xHNseR/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1YIaBLopKIK5_p7NlgWHpGA)(j9m8) | +| cait_s24_384 | 85.05 | 97.34 | 384 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1GU0esukDvMg3u40FZB_5GiB6qpShjvGh/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1qvhNckJjcEf5HyVn8LuEeA)(qb86) | +| cait_m48_448 | 86.49 | 97.75 | 448 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1lJSP__dVERBNFnp7im-1xM3s_lqEe82-/view?usp=sharing)/[baidu](https://pan.baidu.com/s/179MA3MkG2qxFle0K944Gkg)(imk5) | +| deit_base_distilled_patch16_224| 83.32 | 96.49 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/12_x6-NN3Jde2BFUih4OM9NlTwe9-Xlkw/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1ZnmAWgT6ewe7Vl3Xw_csuA)(5f2g) | +| deit_base_distilled_patch16_384| 85.43 | 97.33 | 384 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1i5H_zjSdHfM-Znv89DHTv9ChykWrIt8I/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1PQsQIci4VCHY7l2tCzMklg)(qgj2) | +| shuffle_vit_tiny_patch4_window7| 82.39 | 96.05 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1ffJ-tG_CGVXztPEPQMaT_lUoc4hxFy__/view?usp=sharing)/[baidu](https://pan.baidu.com/s/19DhlLIFyPGOWtyq_c83ZGQ)(8a1i) | +| shuffle_vit_small_patch4_window7| 83.53 | 96.57 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1du9H0SKr0QH9GQjhWDOXOnhpSVpfbb8X/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1rM2J8BVwxQ3kRZoHngwNZA)(xwh3) | +| shuffle_vit_base_patch4_window7| 83.95 | 96.91 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1sYh808AyTG3-_qv6nfN6gCmyagsNAE6q/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1fks_IYDdnXdAkCFuYHW_Nw)(1gsr) | +| cswin_tiny_224 | 82.81 | 96.30 | 224 | 
0.9 | bicubic | [google](https://drive.google.com/file/d/1l-JY0u7NGyD6SjkyiyNnDx3wFFT1nAYO/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1L5FqU7ImWAhQHAlSilqVAw)(4q3h) | +| cswin_small_224 | 83.60 | 96.58 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/10eEBk3wvJdQ8Dy58LvQ11Wk1K2UfPy-E/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1FiaNiWyAuWu1IBsUFLUaAw)(gt1a) | +| cswin_base_224 | 84.23 | 96.91 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1YufKh3DKol4-HrF-I22uiorXSZDIXJmZ/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1koy8hXyGwvgAfUxdlkWofg)(wj8p) | +| cswin_large_224 | 86.52 | 97.99 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1V1hteGK27t1nI84Ac7jdWfydBLLo7Fxt/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1KgIX6btML6kPiPGkIzvyVA)(b5fs) | +| cswin_base_384 | 85.51 | 97.48 | 384 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1qCaFItzFoTYBo-4UbGzL6M5qVDGmJt4y/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1WNkY7o_vP9KJ8cd5c7n2sQ)(rkf5) | +| cswin_large_384 | 87.49 | 98.35 | 384 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1LRN_6qUz71yP-OAOpN4Lscb8fkUytMic/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1eCIpegPj1HIbJccPMaAsew)(6235) | +| t2t_vit_7 | 71.68 | 90.89 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1YkuPs1ku7B_udydOf_ls1LQvpJDg_c_j/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1jVNsz37gatLCDaOoU3NaMA)(1hpa) | +| t2t_vit_10 | 75.15 | 92.80 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1H--55RxliMDlOCekn7FpKrHDGsUkyrJZ/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1nbdb4PFMq4nsIp8HrNxLQg)(ixug) | +| t2t_vit_12 | 76.48 | 93.49 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1stnIwOwaescaEcztaF1QjI4NK4jaqN7P/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1DcMzq9WeSwrS3epv6jKJXw)(qpbb) | +| t2t_vit_14 | 81.50 | 95.67 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1HSvN3Csgsy7SJbxJYbkzjUx9guftkfZ1/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1wcfh22uopBv7pS7rKcH_iw)(c2u8) | +| t2t_vit_19 | 81.93 | 95.74 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1eFnhaL6I33pHCQw2BaEE0Oet9CnjmUf_/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1Hpyc5hBYo1zqoXWpryegnw)(4in3) | +| t2t_vit_24 | 82.28 | 95.89 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1Z7nZCHeFp0AhIkGYcMAFkKdkGN0yXtpv/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1Hpyc5hBYo1zqoXWpryegnw)(4in3) | +| t2t_vit_t_14 | 81.69 | 95.85 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/16li4voStt_B8eWDXqJt7s20OT_Z8L263/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1Hpyc5hBYo1zqoXWpryegnw)(4in3) | +| t2t_vit_t_19 | 82.44 | 96.08 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1Ty-42SYOu15Nk8Uo6VRTJ7J0JV_6t7zJ/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1YdQd6l8tj5xMCWvcHWm7sg)(mier) | +| t2t_vit_t_24 | 82.55 | 96.07 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1cvvXrGr2buB8Np2WlVL7n_F1_CnI1qow/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1BMU3KX_TRmPxQ1jN5cmWhg)(6vxc) | +| t2t_vit_14_384 | 83.34 | 96.50 | 384 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1Yuso8WD7Q8Lu_9I8dTvAvkcXXtPSkmnm/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1AOMhyVRF9zPqJe-lTrd7pw)(r685) | + + + + +### Object Detection ### +| Model | backbone | box_mAP | Model | 
+|-------|-----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------| +| DETR | ResNet50 | 42.0 | [google](https://drive.google.com/file/d/1ruIKCqfh_MMqzq_F4L2Bv-femDMjS_ix/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1J6lB1mezd6_eVW3jnmohZA)(n5gk) | +| DETR | ResNet101 | 43.5 | [google](https://drive.google.com/file/d/11HCyDJKZLX33_fRGp4bCg1I14vrIKYW5/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1_msuuAwFMNbAlMpgUq89Og)(bxz2) | + +### Semantic Segmentation ### +#### Pascal Context #### +|Model | Backbone | Batch_size | mIoU (ss) | mIoU (ms+flip) | Backbone_checkpoint | Model_checkpoint | ConfigFile | +|-----------|-----------|------------|-----------|----------------|-----------------------------------------------|-----------------------------------------------------------------------|------------| +|SETR_Naive | ViT_large | 16 | 52.06 | 52.57 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [google](https://drive.google.com/file/d/1AUyBLeoAcMH0P_QGer8tdeU44muTUOCA/view?usp=sharing)/[baidu](https://pan.baidu.com/s/11XgmgYG071n_9fSGUcPpDQ)(xdb8) | [config](semantic_segmentation/configs/setr/SETR_Naive_Large_480x480_80k_pascal_context_bs_16.yaml) | +|SETR_PUP | ViT_large | 16 | 53.90 | 54.53 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [google](https://drive.google.com/file/d/1IY-yBIrDPg5CigQ18-X2AX6Oq3rvWeXL/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1v6ll68fDNCuXUIJT2Cxo-A)(6sji) | [config](semantic_segmentation/configs/setr/SETR_PUP_Large_480x480_80k_pascal_context_bs_16.yaml) | +|SETR_MLA | ViT_Large | 8 | 54.39 | 55.16 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [google](https://drive.google.com/file/d/1utU2h0TrtuGzRX5RMGroudiDcz0z6UmV/view)/[baidu](https://pan.baidu.com/s/1Eg0eyUQXc-Mg5fg0T3RADA)(wora)| [config](semantic_segmentation/configs/setr/SETR_MLA_Large_480x480_80k_pascal_context_bs_8.yaml) | +|SETR_MLA | ViT_large | 16 | 55.01 | 55.87 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [google](https://drive.google.com/file/d/1SOXB7sAyysNhI8szaBqtF8ZoxSaPNvtl/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1jskpqYbazKY1CKK3iVxAYA)(76h2) | [config](semantic_segmentation/configs/setr/SETR_MLA_Large_480x480_80k_pascal_context_bs_16.yaml) | + +#### Cityscapes #### +|Model | Backbone | Batch_size | Iteration | mIoU (ss) | mIoU (ms+flip) | Backbone_checkpoint | Model_checkpoint | ConfigFile | +|-----------|-----------|------------|-----------|-----------|----------------|-----------------------------------------------|-----------------------------------------------------------------------|------------| +|SETR_Naive | ViT_Large | 8 | 40k | 76.71 | 79.03 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [google](https://drive.google.com/file/d/1QialLNMmvWW8oi7uAHhJZI3HSOavV4qj/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1F3IB31QVlsohqW8cRNphqw)(g7ro) | 
[config](semantic_segmentation/configs/setr/SETR_Naive_Large_768x768_40k_cityscapes_bs_8.yaml)| +|SETR_Naive | ViT_Large | 8 | 80k | 77.31 | 79.43 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [google](https://drive.google.com/file/d/1RJeSGoDaOP-fM4p1_5CJxS5ku_yDXXLV/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1XbHPBfaHS56HlaMJmdJf1A)(wn6q) | [config](semantic_segmentation/configs/setr/SETR_Naive_Large_768x768_80k_cityscapes_bs_8.yaml)| +|SETR_PUP | ViT_Large | 8 | 40k | 77.92 | 79.63 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [google](https://drive.google.com/file/d/12rMFMOaOYSsWd3f1hkrqRc1ThNT8K8NG/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1H8b3valvQ2oLU9ZohZl_6Q)(zmoi) | [config](semantic_segmentation/configs/setr/SETR_PUP_Large_768x768_40k_cityscapes_bs_8.yaml)| +|SETR_PUP | ViT_Large | 8 | 80k | 78.81 | 80.43 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [baidu](https://pan.baidu.com/s/1tkMhRzO0XHqKYM0lojE3_g)(f793) | [config](semantic_segmentation/configs/setr/SETR_PUP_Large_768x768_80k_cityscapes_bs_8.yaml)| +|SETR_MLA | ViT_Large | 8 | 40k | 76.70 | 78.96 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [baidu](https://pan.baidu.com/s/1sUug5cMKSo6mO7BEI4EV_w)(qaiw) | [config](semantic_segmentation/configs/setr/SETR_MLA_Large_768x768_40k_cityscapes_bs_8.yaml)| +|SETR_MLA | ViT_Large | 8 | 80k | 77.26 | 79.27 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [baidu](https://pan.baidu.com/s/1IqPZ6urdQb_0pbdJW2i3ow)(6bgj) | [config](semantic_segmentation/configs/setr/SETR_MLA_Large_768x768_80k_cityscapes_bs_8.yaml)| + + +#### ADE20K #### +|Model | Backbone | Batch_size | Iteration | mIoU (ss) | mIoU (ms+flip) | Backbone_checkpoint | Model_checkpoint | ConfigFile | +|-----------|-----------|------------|-----------|-----------|----------------|-----------------------------------------------|-----------------------------------------------------------------------|------------| +|SETR_Naive | ViT_Large | 16 | 160k | 47.57 | 48.12 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [baidu](https://pan.baidu.com/s/1_AY6BMluNn71UiMNZbnKqQ)(lugq) | [config](semantic_segmentation/configs/setr/SETR_Naive_Large_512x512_160k_ade20k_bs_16.yaml)| +|SETR_PUP | ViT_Large | 16 | 160k | 49.12 | 49.51 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [baidu](https://pan.baidu.com/s/1N83rG0EZSksMGZT3njaspg)(udgs) | [config](semantic_segmentation/configs/setr/SETR_PUP_Large_512x512_160k_ade20k_bs_16.yaml)| +|SETR_MLA | ViT_Large | 8 | 160k | 47.80 | 49.34 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [baidu](https://pan.baidu.com/s/1L83sdXWL4XT02dvH2WFzCA)(mrrv) | 
[config](semantic_segmentation/configs/setr/SETR_MLA_Large_512x512_160k_ade20k_bs_8.yaml)| +|DPT | ViT_Large | 16 | 160k | 47.21 | - | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) |[baidu](https://pan.baidu.com/s/1PCSC1Kvcg291gqp6h5pDCg)(ts7h) | [config](semantic_segmentation/configs/dpt/DPT_Large_480x480_160k_ade20k_bs_16.yaml) +|Segmenter | ViT_Tiny | 16 | 160k | 38.45 | - | TODO |[baidu](https://pan.baidu.com/s/1nZptBc-IY_3PFramXSlovQ)(1k97) | [config](semantic_segmentation/configs/segmenter/segmenter_Tiny_512x512_160k_ade20k_bs_16.yaml) +|Segmenter | ViT_Small | 16 | 160k | 46.07 | - | TODO |[baidu](https://pan.baidu.com/s/1gKE-GEu7gX6dJsgtlvrmWg)(i8nv) | [config](semantic_segmentation/configs/segmenter/segmenter_small_512x512_160k_ade20k_bs_16.yaml) +|Segmenter | ViT_Base | 16 | 160k | 49.08 | - | TODO |[baidu](https://pan.baidu.com/s/1qb7HEtKW0kBSP6iv-r_Hjg)(hxrl) | [config](semantic_segmentation/configs/segmenter/segmenter_Base_512x512_160k_ade20k_bs_16.yaml) | +|Segmenter | ViT_Large | 16 | 160k | 51.82 | - | TODO |[baidu](https://pan.baidu.com/s/121FOwpsYue7Z2Rg3ZlxnKg)(wdz6) | [config](semantic_segmentation/configs/segmenter/segmenter_Tiny_512x512_160k_ade20k_bs_16.yaml) +|Segmenter_Linear | DeiT_Base | 16 | 160k | 47.34 | - | TODO |[baidu](https://pan.baidu.com/s/1Hk_zcXUIt_h5sKiAjG2Pog)(5dpv) | [config](semantic_segmentation/configs/segmenter/segmenter_Base_distilled_512x512_160k_ade20k_bs_16.yaml) +|Segmenter | DeiT_Base | 16 | 160k | 49.27 | - | TODO |[baidu](https://pan.baidu.com/s/1-TBUuvcBKNgetSJr0CsAHA)(3kim) | [config](semantic_segmentation/configs/segmenter/segmenter_Base_distilled_512x512_160k_ade20k_bs_16.yaml) | +|Segformer | MIT-B0 | 16 | 160k | 38.37 | - | TODO |[baidu](https://pan.baidu.com/s/1WOD9jGjQRLnwKrRYzgBong)(ges9) | [config](semantic_segmentation/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.yaml) | +|Segformer | MIT-B1 | 16 | 160k | 42.20 | - | TODO |[baidu](https://pan.baidu.com/s/1aiSBXMd8nP82XK7sSZ05gg)(t4n4) | [config](semantic_segmentation/configs/segmenter/segformer_mit-b1_512x512_160k_ade20k.yaml) | +|Segformer | MIT-B2 | 16 | 160k | 46.38 | - | TODO |[baidu](https://pan.baidu.com/s/1wFFh-K5t46YktkfoWUOTAg)(h5ar) | [config](semantic_segmentation/configs/segmenter/segformer_mit-b2_512x512_160k_ade20k.yaml) | +|Segformer | MIT-B3 | 16 | 160k | 48.35 | - | TODO |[baidu](https://pan.baidu.com/s/1IwBnDeLNyKgs-xjhlaB9ug)(g9n4) | [config](semantic_segmentation/configs/segmenter/segformer_mit-b3_512x512_160k_ade20k.yaml) | +|Segformer | MIT-B4 | 16 | 160k | 49.01 | - | TODO |[baidu](https://pan.baidu.com/s/1a25fCVlwJ-1TUh9HQfx7YA)(e4xw) | [config](semantic_segmentation/configs/segmenter/segformer_mit-b4_512x512_160k_ade20k.yaml) | +|Segformer | MIT-B5 | 16 | 160k | 49.73 | - | TODO |[baidu](https://pan.baidu.com/s/15kXXxKEjjtJv-BmrPnSTOw)(uczo) | [config](semantic_segmentation/configs/segmenter/segformer_mit-b5_512x512_160k_ade20k.yaml) | +| UperNet | Swin_Tiny | 16 | 160k | 44.90 | 45.37 | - |[baidu](https://pan.baidu.com/s/1S8JR4ILw0u4I-DzU4MaeVQ)(lkhg) | [config](semantic_segmentation/configs/upernet_swin/upernet_swin_tiny_patch4_windown7_512x512_160k_ade20k.yaml) | +| UperNet | Swin_Small | 16 | 160k | 47.88 | 48.90 | - |[baidu](https://pan.baidu.com/s/17RKeSpuWqONVptQZ3B4kEA)(vvy1) | [config](semantic_segmentation/configs/upernet_swin/upernet_swin_small_patch4_windown7_512x512_160k_ade20k.yaml) | +| UperNet | Swin_Base | 16 | 160k | 
48.59 | 49.04 | - |[baidu](https://pan.baidu.com/s/1bM15KHNsb0oSPblQwhxbgw)(y040) | [config](semantic_segmentation/configs/upernet_swin/upernet_swin_base_patch4_windown7_512x512_160k_ade20k.yaml) | + +#### Trans10kV2 #### +|Model | Backbone | Batch_size | Iteration | mIoU (ss) | mIoU (ms+flip) | Backbone_checkpoint | Model_checkpoint | ConfigFile | +|-----------|-----------|------------|-----------|-----------|----------------|-----------------------------------------------|-----------------------------------------------------------------------|------------| +|Trans2seg_Medium | Resnet50c | 16 | 80k | 72.25 | - | [google](https://drive.google.com/file/d/1C6nMg6DgQ73wzF21UwDVxmkcRTeKngnK/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1hs0tbSGIeMLLGMq05NN--w)(4dd5) | [google](https://drive.google.com/file/d/1zGEBEN27CQMgZBYqqAg_agJE6CPLOpYW/view?usp=sharing)/[baidu](https://pan.baidu.com/s/102GUBeoEPMqMEqF3smgyCA)(qcb0) | [config](semantic_segmentation/configs/trans2seg/Trans2Seg_medium_512x512_80k_trans10kv2_bs_16.yaml)| + +### GAN ### +| Model | FID | Image Size | Crop_pct | Interpolation | Model | +|--------------------------------|-----|------------|----------|---------------|--------------| +| styleformer_cifar10 |2.73 | 32 | 1.0 | lanczos |[google](https://drive.google.com/file/d/1iW76QmwbYz6GeAPQn8vKvsG0GvFdhV4T/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1Ax7BNEr1T19vgVjXG3rW7g)(ztky) | +| styleformer_stl10 |15.65| 48 | 1.0 | lanczos |[google](https://drive.google.com/file/d/15p785y9eP1TeoqUcHPbwFPh98WNof7nw/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1rSORxMYAiGkLQZ4zTA2jcg)(i973)| +| styleformer_celeba |3.32 | 64 | 1.0 | lanczos |[google](https://drive.google.com/file/d/1_YauwZN1osvINCboVk2VJMscrf-8KlQc/view?usp=sharing)/[baidu](https://pan.baidu.com/s/16NetcPxLQF9C_Zlp1SpkLw)(fh5s) | +| styleformer_lsun | 9.68 | 128 | 1.0 | lanczos |[google](https://drive.google.com/file/d/1i5kNzWK04ippFSmrmcAPMItkO0OFukTd/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1jTS9ExAMz5H2lhue4NMV2A)(158t)| +> *The results are evaluated on Cifar10, STL10, Celeba and LSUNchurch dataset, using **fid50k_full** metric. + + +## Quick Demo for Image Classification +To use the model with pretrained weights, go to the specific subfolder e.g., `/image_classification/ViT/`, then download the `.pdparam` weight file and change related file paths in the following python scripts. The model config files are located in `。、configs/`. + +Assume the downloaded weight file is stored in `./vit_base_patch16_224.pdparams`, to use the `vit_base_patch16_224` model in python: +```python +from config import get_config +from visual_transformer import build_vit as build_model +# config files in ./configs/ +config = get_config('./configs/vit_base_patch16_224.yaml') +# build model +model = build_model(config) +# load pretrained weights, .pdparams is NOT needed +model_state_dict = paddle.load('./vit_base_patch16_224') +model.set_dict(model_state_dict) +``` +> :robot: See the README file in each model folder for detailed usages. + + +### Evaluation ### +To evaluate ViT model performance on ImageNet2012 with a single GPU, run the following script using command line: +```shell +sh run_eval.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/vit_base_patch16_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./vit_base_patch16_224' +``` + +
+ + +Run evaluation using multi-GPUs: + + + +```shell +sh run_eval_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/vit_base_patch16_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./vit_base_patch16_224' +``` + +
+ + +### Training ### +To train the ViT model on ImageNet2012 with single GPU, run the following script using command line: +```shell +sh run_train.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/vit_base_patch16_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=32 \ + -data_path='/dataset/imagenet' \ +``` + + +
+ + +Run training using multi-GPUs: + + + +```shell +sh run_train_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/vit_base_patch16_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ +``` + +
+
+
+
+## Features ##
+1. State-of-the-art
+   - State-of-the-art transformer models for multiple CV tasks
+   - State-of-the-art data processing and training methods
+   - We keep pushing it forward.
+
+2. Easy-to-use tools
+   - Easy configs for model variants
+   - Modular design for utility functions and tools
+   - Low barrier for educators and practitioners
+   - Unified framework for all the models
+
+3. Easily customizable to your needs
+   - Examples for each model to reproduce the results
+   - Model implementations are exposed for you to customize
+   - Model files can be used independently for quick experiments
+
+4. High Performance
+   - DDP with a single GPU per process.
+   - Mixed-precision support (coming soon)
+
+
+## Contributing ##
+* We encourage and appreciate your contribution to the **PaddleViT** project; please refer to our workflow and working style in [CONTRIBUTING.md](./CONTRIBUTING.md).
+
+
+## Licenses ##
+* This repo is under the Apache-2.0 license.
+
+## Contact ##
+* Please raise an issue on GitHub.
diff --git a/docs/paddlevit-config.md b/docs/paddlevit-config.md
new file mode 100644
index 00000000..b5af2154
--- /dev/null
+++ b/docs/paddlevit-config.md
@@ -0,0 +1,129 @@
+## PaddleViT: How to use config?
+> sample code: [here](../image_classification/ViT/config.py)
+
+This document presents the basics of the `config` used in the **PaddleViT** project.
+
+The core module used in the PPViT `config` is [yacs](https://github.com/rbgirshick/yacs) (0.1.8+). Similar to other projects, the PPViT `config` supports loading from a [yaml](https://yaml.org/) file and is configurable using the Python [ArgumentParser](https://docs.python.org/3/library/argparse.html).
+
+> Full usage of `yacs` can be found in https://github.com/rbgirshick/yacs
+
+### 1. Installation
+#### 1.1 Install by `pip`
+To install `yacs` version `0.1.8`:
+```shell
+$ pip install yacs==0.1.8
+```
+#### 1.2 Install from source
+You can also install `yacs` from GitHub:
+```shell
+$ git clone https://github.com/rbgirshick/yacs.git
+$ cd yacs
+$ python setup.py install
+```
+
+### 2. Basic Concepts and Usage
+#### 1. CfgNode
+`CfgNode` represents an internal node in the configuration tree. It is a `dict`-like container, which allows attribute-based access to its keys.
+```python
+from yacs.config import CfgNode as CN
+
+_C = CN()
+
+_C.NUM_GPUS = 4
+_C.NUM_WORKERS = 8
+_C.BATCH_SIZE = 128
+
+def get_config():
+    return _C.clone()
+```
+#### 2. Read `.yaml` file using `merge_from_file()`
+`yacs` allows reading a YAML file to override the `CfgNode`. You can create a `.yaml` file for each experiment, which only changes the options for that experiment.
+
+Some basic formats in a YAML file:
+```YAML
+key:                            # YAML uses 'key: value' pairs, separated using ':'
+  child_key: value              # indentation is used to show different levels
+  child_KEY2: value3            # YAML is case sensitive
+  c_arr: [val1, val2, val3]     # arrays can be used as values
+  c_bool: True                  # True/true/TRUE are all OK
+  c_float: 3.1415               # floats are allowed
+  c_string: no quote is needed  # "", '', and no quotes are all OK
+```
+
+`merge_from_file()` can be used to override the current `CfgNode`:
+```python
+cfg = get_config()
+cfg.merge_from_file('experiment_1.yaml')
+print(cfg)
+```
+
+#### 3. Override by `ArgumentParser`
+You can write your own method to update the config using the Python `ArgumentParser`, e.g.:
+```python
+def update_config(config, args):
+    if args.cfg:  # update from .yaml file
+        update_config_from_file(config, args.cfg)
+    if args.batch_size:  # update BATCH_SIZE
+        config.BATCH_SIZE = args.batch_size
+    if args.eval:
+        config.EVAL = True
+    return config
+```
+
+
+
+
+### 3. Practical Guide of using config for PPViT
+#### STEP 1: Create config.py
+Create a Python file `config.py`, which is the place to define **all the configurable options**. It should be well documented and provide suitable default values for all options.
+Typically, `config.py` should have:
+- `DATA`: defines the dataset path, input image size, and batch_size, etc.
+- `MODEL`:
+  - General options for your model, such as model name, num_classes, etc.
+  - `TRANS`: transformer related options, such as mlp dimension, hidden dimension, number of heads, etc.
+- `TRAIN`: training related options, such as num of epochs, lr, weight decay, etc.
+
+In `config.py`, you should implement `update_config(config, args)`, which reads the current `config` and the `args` from the `ArgumentParser` to update the config using command-line options.
+
+#### STEP 2:
+In your `main.py`, create an `ArgumentParser` that includes all the options used in the `update_config(config, args)` method from `config.py`, e.g.:
+```python
+    parser = argparse.ArgumentParser('ViT')
+    parser.add_argument('-cfg', type=str, default=None)
+    parser.add_argument('-dataset', type=str, default=None)
+    parser.add_argument('-batch_size', type=int, default=None)
+    parser.add_argument('-image_size', type=int, default=None)
+    parser.add_argument('-data_path', type=str, default=None)
+    parser.add_argument('-ngpus', type=int, default=None)
+    parser.add_argument('-pretrained', type=str, default=None)
+    parser.add_argument('-eval', action='store_true')
+    args = parser.parse_args()
+
+    # get default config
+    config = get_config()
+    # update config by arguments
+    config = update_config(config, args)
+```
+
+Then you can use attribute-based access to get the config option values.
+
+#### STEP 3:
+You should create a single `.yaml` file for each experiment, e.g.,
+```yaml
+DATA:
+  IMAGE_SIZE: 224
+  CROP_PCT: 0.875
+MODEL:
+  TYPE: ViT
+  NAME: vit_large_patch16_224
+  TRANS:
+    PATCH_SIZE: 16
+    HIDDEN_SIZE: 1024
+    MLP_DIM: 4096  # same as mlp_ratio = 4.0
+    NUM_LAYERS: 24
+    NUM_HEADS: 16
+    QKV_BIAS: True
+```
+
+If you set the command line argument `-cfg` to the `.yaml` file path, the config will be overridden with the file options.
+> **Note:** the `.yaml` file overrides the config before the `args` do, so the command-line options in `args` take the final effect.
diff --git a/docs/paddlevit-multi-gpu.md b/docs/paddlevit-multi-gpu.md
new file mode 100644
index 00000000..8789e997
--- /dev/null
+++ b/docs/paddlevit-multi-gpu.md
@@ -0,0 +1,140 @@
+## PaddleViT: How to use multi-gpu?
+This document presents **how to use** and **how to implement** multi-gpu (single node) training and validation in `PaddleViT` for your model.
+
+`PaddleViT` implements its multi-gpu schemes based on the `paddle.distributed` package, and we also hack some useful functions for inter-gpu communication and data transfer.
+
+> Detailed official `paddle.distributed` docs can be found: [here](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/Overview_cn.html)
+
+### 1. How to use multi-gpu for training/validation?
+In `PaddleViT`, multi-gpu is easy and straightforward to use.
+Typically, you will have a script file (e.g., `run_train_multi.sh`) to start your experiment. This `.sh` script runs the Python file (e.g., `main_multi_gpu.py`) with command-line options. For example, a validation script `run_eval_multi.sh` calls `main_multi_gpu.py` with a number of arguments:
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+python main_multi_gpu.py \
+-cfg='./configs/vit_base_patch16_224.yaml' \
+-dataset='imagenet2012' \
+-batch_size=16 \
+-data_path='/dataset/imagenet' \
+-eval \
+-pretrained='./vit_base_patch16_224' \
+```
+In this shell script:
+- `CUDA_VISIBLE_DEVICES` sets which GPUs will be used.
+- `batch_size` sets the batch_size on a **single** GPU.
+
+You can run this shell script to start your experiment, e.g.:
+```
+$ sh run_train_multi.sh
+```
+
+### 2. How are the multi-gpu schemes implemented in PaddleViT?
+#### STEP 0: Preparation
+We use the `paddle.distributed` package in `PaddleViT`:
+```python
+import paddle.distributed as dist
+```
+
+We present the basic concepts and steps of training/validation on multiple GPUs:
+- Multiple subprocesses are launched.
+- Each process runs on 1 single GPU.
+- Each process runs its own training/validation.
+- The dataset is split; each process handles one part of the whole dataset.
+- On each GPU, the forward pass is applied on its own batch data.
+- Gradients on each GPU are collected and averaged (all_reduced).
+- Averaged gradients are synced on each GPU for each iteration.
+- Gradient descent is applied on each GPU using the averaged gradients.
+- Validation results are collected across all GPUs.
+- Communication between GPUs is based on `NCCL2`.
+
+
+#### STEP 1: Create `main` method
+Define a `main` method that contains the following steps:
+1. Create the `dataset` and `dataloader`. (see STEP 2)
+2. Get and set the number of GPUs to use.
+3. Launch multi-processing for multi-gpu training/validation.
+
+The `main` method could be similar to:
+```python
+def main():
+    dataset_train = get_dataset(config, mode='train')
+    dataset_val = get_dataset(config, mode='val')
+    config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS
+    dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS)
+```
+Where
+- `paddle.static.cuda_places()` gets all the available GPUs in the current env.
+- `dist.spawn` launches multiprocessing.
+- `main_worker` contains the full training/validation procedures.
+- `args` sends the datasets to all the subprocesses.
+- `nprocs` determines the number of subprocesses to launch; set this to the number of GPUs.
+
+#### STEP 2: Create `dataset` and `dataloader`
+1. Dataset
+
+   `dataset` is defined in the same way as when using a single GPU. Typically, you create a dataset class which implements `paddle.io.Dataset`. The `__getitem__` and `__len__` methods must be implemented, to read the data and to get the total length of the whole dataset.
+
+   In our multi-gpu scheme, we create a single `dataset` in the main process, which is then passed (as an argument) to all the subprocesses via `args` in the `dist.spawn` method.
+2. Dataloader
+
+   `dataloader` defines how to load the batch data; you can create a `paddle.io.DataLoader` with a `paddle.io.Dataset` and a `DistributedBatchSampler` as its inputs. Other commonly used input parameters are `batch_size(int)`, `shuffle(bool)` and `collate_fn`.
+
+   For the multi-gpu scheme, the `DistributedBatchSampler` is used to split the dataset into `num_replicas` parts and to sample batch data for each process/GPU (`rank`).
+   For example:
+   ```python
+   sampler = DistributedBatchSampler(dataset,
+                                     batch_size=batch_size,
+                                     shuffle=(mode == 'train'))
+   ```
+   The dataloader is initialized in each process (which means you will initialize the instance in the `main_worker` method); the `num_replicas` and `rank` will be automatically determined by the distributed env.
+
+#### STEP 3: Multi-GPU Training
+In STEP 1, the first argument in `dist.spawn` is `main_worker`, which is a method that contains the full training/validation procedures. You can think of the `main` method as running on the main process (master), which launches a number of subprocesses (workers). These subprocesses run the contents defined in `main_worker`.
+
+Specifically, in the `main_worker` we have:
+1. Init distributed env: `dist.init_parallel_env()`
+2. (Optional) Get world-size: `dist.get_world_size()`
+3. (Optional) Get current rank: `dist.get_rank()`
+4. Make the model ready for multi-gpu: `model=paddle.DataParallel(model)`
+5. Get dataloader with `DistributedBatchSampler`
+6. Training (same as using single-gpu)
+
+#### STEP 4: Multi-GPU Validation
+In `main_worker` for validation, we will have:
+1. Init distributed env: `dist.init_parallel_env()`
+2. Make the model ready for multi-gpu: `model=paddle.DataParallel(model)`
+3. Get dataloader with `DistributedBatchSampler`
+4. Validation (same as single-gpu)
+5. For each iteration, **gather the results across all GPUs**
+
+Since each process/GPU runs inference on its own batch data, we must gather these results to get the overall performance. In Paddle, `paddle.distributed.all_reduce` gathers the tensor across GPUs, which can be called in each iteration:
+```python
+output, _ = model(image)  # inference
+loss = criterion(output, label)  # get loss
+
+pred = F.softmax(output)  # get preds
+acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1))  # top1 acc
+
+dist.all_reduce(loss)  # gather loss from all GPUs
+dist.all_reduce(acc1)  # gather top1 acc from all GPUs
+
+loss = loss / dist.get_world_size()  # get average loss
+acc1 = acc1 / dist.get_world_size()  # get average top1 acc
+```
+Note that by default `all_reduce` returns the `SUM` of the tensor values across GPUs, therefore we divide by the `world_size` to get the average.
+
+Finally, the `AverageMeter` can be used to log the results as when using a single GPU:
+```python
+batch_size = paddle.to_tensor(image.shape[0])
+dist.all_reduce(batch_size)
+val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0])
+val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0])
+```
+
+### 3. Advanced functions
+For developers who need advanced communication/data transfer between GPUs in `PaddleViT`, we hacked two methods to `reduce` dict objects and to `gather` any (picklable) object, rather than only `paddle.Tensor`.
+
+Specifically:
+
+- `reduce_dict(input_dict, average=True)` is defined to take a `dict` storing key: tensor pairs; if `average` is set to `True`, `all_reduce` will average each value in the dict by the world size. If `average` is `False`, the regular `sum` operation will be applied to each value in the dict.
+
+- `all_gather(data)` is defined to `all_gather` any picklable data, rather than only `paddle.Tensor`. The input is a data object; the output is a list of the gathered data from each rank.
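+
+For example, a minimal usage sketch of these two helpers (assuming they are imported from the `utils.py` mentioned in the note below, that the distributed environment has already been initialized inside `main_worker`, and that the illustrative import path and variable names are adapted to your own layout):
+```python
+import paddle
+import paddle.distributed as dist
+
+from utils import reduce_dict, all_gather  # assumed import path, see note below
+
+# each process holds its own loss terms for the current batch
+loss_dict = {'loss_ce': paddle.to_tensor(0.5), 'loss_bbox': paddle.to_tensor(1.2)}
+
+# average every value in the dict across GPUs (use average=False for a plain sum)
+reduced_losses = reduce_dict(loss_dict, average=True)
+
+# gather any picklable object from every rank; the result is a list with
+# one entry per process (world_size entries in total)
+local_stats = {'rank': dist.get_rank(), 'num_images': 16}
+gathered_stats = all_gather(local_stats)
+```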
+
+> Detailed implementations can be found in PaddleViT `object_detection/DETR/utils.py`
diff --git a/docs/paddlevit-port-weights.md b/docs/paddlevit-port-weights.md
new file mode 100644
index 00000000..a6912d4b
--- /dev/null
+++ b/docs/paddlevit-port-weights.md
@@ -0,0 +1,91 @@
+## PaddleViT: How to port a model from PyTorch to Paddle?
+> Sample code: [here](../image_classification/ViT/load_pytorch_weights.py)
+
+### Step 0:
+We assume you are implementing your Paddle version of some ViT model from some PyTorch implementation, and you want to port the pretrained weights from a pytorch `.pth` file to a paddle `.pdparams` file.
+
+So we now have:
+- One `torch.nn.Module` class that implements the model in pytorch.
+- One `.pth` pretrained weight file corresponding to the pytorch model.
+- One `paddle.nn.Layer` class that implements the same model in paddle.
+
+> Note: the `paddle.nn.Layer` class must be implemented in a similar way to the referred `torch.nn.Module`. Here 'similar' means the param sizes, tensor shapes, and compute logics are the same, while the names of the layers/params or the detailed implementations could be different.
+
+We still need to implement:
+- `load_pytorch_weights.py`, which contains the methods and name mappings for model conversion.
+
+Now we show how to implement `load_pytorch_weights.py`.
+
+### Step 1:
+  Load your paddle model, e.g.:
+  ```python
+  paddle_model = build_model(config)
+  paddle_model.eval()
+  ```
+  You can just init a model class to build a model object; please refer to our PPViT code for detailed model definitions and the usage of `config`.
+
+
+### Step 2:
+  Load your pytorch model with pretrained weights.
+
+  For example, if we use models from the `timm` project:
+  ```python
+  import timm
+  torch_model = timm.create_model('vit_base_patch16_224', pretrained=True)
+  torch_model.eval()
+  ```
+> timm: https://github.com/rwightman/pytorch-image-models
+
+### Step 3:
+  Check the name mappings (**manually**).
+  In the `torch_to_paddle_mapping` method, you create a list of string tuples that defines the corresponding param and buffer names for the torch and paddle models. E.g.:
+- In the **torch** model, one param is named `patch_embed.proj.weight`
+- In the **paddle** model, the same param is named `embeddings.patch_embeddings.weight`
+Then you have a tuple `(patch_embed.proj.weight, embeddings.patch_embeddings.weight)` saved in the mapping list.
+
+  > NOTE: You can use a **for loop** and **prefix strings** to semi-automate your name mapping process, as shown in the sketch after this step.
+
+  > NOTE: Do NOT forget to add name mappings for `model.named_buffers()`
+
+Usually I copy the printed torch param/buffer names and shapes and the printed paddle param/buffer names and shapes, each into an individual text file, then check the mapping line by line and modify `torch_to_paddle_mapping` if necessary.
+
+If all the name mappings are correct, run the conversion by:
+```python
+paddle_model = convert(torch_model, paddle_model)
+```
+> This method converts the param weights from torch to the proper format, and then sets the values of the corresponding paddle params. The returned object is the paddle model object with the same pretrained weights as the pytorch model.
+
+> In the `convert` method, the weights of `torch.nn.Linear` are transposed, to match the weight shape of `paddle.nn.Linear`.
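+For illustration, a sketch of such a semi-automated mapping might look like the code below. The layer names and prefixes here are hypothetical -- they must be replaced with the names actually printed from your own torch and paddle models.
+```python
+def torch_to_paddle_mapping(num_layers=12):
+    # hand-written mappings for the non-repeated params (hypothetical names)
+    mapping = [
+        ('cls_token', 'cls_token'),
+        ('patch_embed.proj.weight', 'embeddings.patch_embeddings.weight'),
+        ('patch_embed.proj.bias', 'embeddings.patch_embeddings.bias'),
+    ]
+    # use a for loop + prefix strings for the repeated encoder blocks
+    for idx in range(num_layers):
+        th_prefix = f'blocks.{idx}'
+        pp_prefix = f'encoder.layers.{idx}'
+        mapping.extend([
+            (f'{th_prefix}.norm1.weight', f'{pp_prefix}.attn_norm.weight'),
+            (f'{th_prefix}.norm1.bias', f'{pp_prefix}.attn_norm.bias'),
+            (f'{th_prefix}.attn.qkv.weight', f'{pp_prefix}.attn.qkv.weight'),
+            (f'{th_prefix}.mlp.fc1.weight', f'{pp_prefix}.mlp.fc1.weight'),
+        ])
+    # remember to add mappings for model.named_buffers() as well
+    return mapping
+```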
+### Step 4:
+Check correctness.
+Create batch data corresponding to the model input, e.g.:
+```python
+# check correctness
+x = np.random.randn(2, 3, 224, 224).astype('float32')
+x_paddle = paddle.to_tensor(x)
+x_torch = torch.Tensor(x).to(device)
+```
+Then do inference and convert the outputs into numpy arrays:
+```python
+out_torch = torch_model(x_torch)
+out_paddle = paddle_model(x_paddle)
+
+out_torch = out_torch.data.cpu().numpy()
+out_paddle = out_paddle.cpu().numpy()
+```
+Finally, check if the outputs are the same for `paddle_model` and `torch_model`:
+```python
+assert np.allclose(out_torch, out_paddle, atol=1e-5)
+```
+
+### Step 5:
+Save the model weights for paddle:
+```python
+paddle.save(paddle_model.state_dict(), model_path)
+```
+
+> **Tips:**
+> - BN layers usually have buffers such as `_mean` and `_variance`.
+> - Do not forget customized buffers defined in the model, e.g., via `register_buffer()`.
+> - Use batched data (batch size > 1) to test the results.
+> - Some params are 2-D but are not Linear params, so `_set_value` must be called with `transpose=False`.
diff --git a/gan/README.md b/gan/README.md
new file mode 100644
index 00000000..91a09b54
--- /dev/null
+++ b/gan/README.md
@@ -0,0 +1,111 @@
+# PaddleViT-GAN: Visual Transformer Models for GAN
+PaddlePaddle training/validation code and pretrained models for **GAN**.
+
+This implementation is part of the [PaddleViT](https://github.com/BR-IDL/PaddleViT) project.
+
+## Update
+Update (2021-08-25): Initial readme uploaded.
+
+## Quick Start
+
+The following links provide the code and detailed usage of each model architecture:
+1. **[Styleformer](./Styleformer)**
+2. **[TransGAN](./transGAN)**
+
+
+## Installation
+This module is tested on Python 3.6+ and PaddlePaddle 2.1.0+. Most dependencies are installed by the PaddlePaddle installation. You only need to install the following packages:
+```shell
+pip install yacs yaml lmdb
+```
+Then download the github repo:
+```shell
+git clone https://github.com/xperzy/PPViT.git
+cd PPViT/gan
+```
+
+## Basic Usage
+### Data Preparation
+**Cifar10**, **STL10**, **Celeba** and **LSUNchurch** datasets are used with the following folder structure:
+#### [Cifar10](https://www.cs.toronto.edu/~kriz/cifar.html):
+
+  We use `paddle.vision.datasets.Cifar10` to create the Cifar10 dataset; downloading and preparing the data manually is NOT needed.
+#### [STL10](https://cs.stanford.edu/~acoates/stl10/):
+```
+│STL10/
+├── train_X.bin
+│── train_y.bin
+├── test_X.bin
+│── test_y.bin
+│── unlabeled.bin
+```
+#### [CelebA](https://mmlab.ie.cuhk.edu.hk/projects/CelebA.html):
+```
+│Celeba/
+├──img_align_celeba/
+│  ├── 000017.jpg
+│  │── 000019.jpg
+│  ├── 000026.jpg
+│  │── ......
+```
+#### [LSUN-church](https://www.yf.io/p/lsun):
+```
+│LSUNchurch/
+├──church_outdoor_train_lmdb/
+│  ├── data.mdb
+│  │── lock.mdb
+```
+### Demo Example
+For a specific model example, go to the model folder and download the pretrained weight file, e.g., `./cifar10.pdparams`, then use the `styleformer_cifar10` model in python:
+```python
+from config import get_config
+from generator import Generator
+# config files in ./configs/
+config = get_config('./configs/styleformer_cifar10.yaml')
+# build model
+model = Generator(config)
+# load pretrained weights, .pdparams is NOT needed
+model_state_dict = paddle.load('./cifar10')
+model.set_dict(model_state_dict)
+```
+
+### Generate Sample Images
+To generate sample images from pretrained models, download the pretrained weights and run the following script using the command line:
+```shell
+sh run_generate.sh
+```
+or
+```shell
+python generate.py \
+  -cfg='./configs/styleformer_cifar10.yaml' \
+  -num_out_images=16 \
+  -out_folder='./images_cifar10' \
+  -pretrained='./cifar10.pdparams'
+```
+The output images are stored in the `-out_folder` path.
+
+> :robot: See the README file in each model folder for detailed usages.
+
+## Basic Concepts
+The PaddleViT GAN module is developed in separate folders for each model with a similar structure. Each implementation is organized around 3 types of classes and 2 types of scripts:
+1. **Model classes** such as **[ViT_custom.py](./transGAN/models/ViT_custom.py)**, in which the core *transformer model* and related methods are defined.
+
+2. **Dataset classes** such as **[datasets.py](./transGAN/datasets.py)**, in which the dataset, dataloader, and data transforms are defined. We provide flexible implementations for you to customize the data loading scheme. Both single-GPU and multi-GPU loading are supported.
+
+3. **Config classes** such as **[config.py](./transGAN/config.py)**, in which the model and training/validation configurations are defined. Usually, you don't need to change the items in the configuration; we support updating configs by python `arguments` or `.yaml` config files. You can see [here](../docs/ppvit-config.md) for details of our configuration design and usage.
+
+4. **main scripts** such as **[main_single_gpu.py](./transGAN/main_single_gpu.py)**, in which the whole training/validation procedures are defined. The major steps of training or validation are provided, such as logging, loading/saving models, finetuning, etc. Multi-GPU training/validation is also supported and implemented in the separate python script `main_multi_gpu.py`.
+
+5. **run scripts** such as **[run_eval_cifar.sh](./transGAN/run_eval_cifar.sh)**, in which the shell commands for running the python scripts with specific configs and arguments are defined.
+
+
+## Model Architectures
+
+PaddleViT now provides the following **transformer based models**:
+1. **[TransGAN](./transGAN)** (from UT-Austin and MIT-IBM Watson AI Lab), released with paper [TransGAN: Two Pure Transformers Can Make One Strong GAN, and That Can Scale Up](https://arxiv.org/abs/2102.07074), by Yifan Jiang, Shiyu Chang, Zhangyang Wang.
+2. **[Styleformer](./Styleformer)** (from Seoul National University), released with paper [Styleformer: Transformer based Generative Adversarial Networks with Style Vector](https://arxiv.org/abs/2106.07023), by Jeeseung Park, Younggeun Kim.
+
+
+
+## Contact
+If you have any questions, please create an [issue](https://github.com/BR-IDL/PaddleViT/issues) on our Github.
diff --git a/gan/Styleformer/README.md b/gan/Styleformer/README.md new file mode 100644 index 00000000..39d93cba --- /dev/null +++ b/gan/Styleformer/README.md @@ -0,0 +1,192 @@ +# Styleformer: Transformer based Generative Adversarial Networks with Style Vector, [arxiv](https://arxiv.org/abs/2106.07023v2) + +PaddlePaddle training/validation code and pretrained models for **Styleformer**. + +The official pytorch implementation is [here](https://github.com/Jeeseung-Park/Styleformer). + +This implementation is developed by [PaddleViT](https://github.com/BR-IDL/PaddleViT). + + + +drawing +
Styleformer Model Overview
+ +### Update +Update (2021-08-17): Code is released and ported weights are uploaded. + +## Models Zoo +| Model | FID | Image Size | Crop_pct | Interpolation | Model | +|--------------------------------|-----|------------|----------|---------------|--------------| +| styleformer_cifar10 |2.73 | 32 | 1.0 | lanczos |[google](https://drive.google.com/file/d/1iW76QmwbYz6GeAPQn8vKvsG0GvFdhV4T/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1Ax7BNEr1T19vgVjXG3rW7g)(ztky) | +| styleformer_stl10 |15.65| 48 | 1.0 | lanczos |[google](https://drive.google.com/file/d/15p785y9eP1TeoqUcHPbwFPh98WNof7nw/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1rSORxMYAiGkLQZ4zTA2jcg)(i973)| +| styleformer_celeba |3.32 | 64 | 1.0 | lanczos |[google](https://drive.google.com/file/d/1_YauwZN1osvINCboVk2VJMscrf-8KlQc/view?usp=sharing)/[baidu](https://pan.baidu.com/s/16NetcPxLQF9C_Zlp1SpkLw)(fh5s) | +| styleformer_lsun | 9.68 | 128 | 1.0 | lanczos |[google](https://drive.google.com/file/d/1i5kNzWK04ippFSmrmcAPMItkO0OFukTd/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1jTS9ExAMz5H2lhue4NMV2A)(158t)| +> *The results are evaluated on Cifar10, STL10, Celeba and LSUNchurch dataset, using **fid50k_full** metric. +## Notebooks +We provide a few notebooks in aistudio to help you get started: + +**\*(coming soon)\*** + + +## Requirements +- Python>=3.6 +- yaml>=0.2.5 +- lmdb>=1.2.1 +- [PaddlePaddle](https://www.paddlepaddle.org.cn/documentation/docs/en/install/index_en.html)>=2.1.0 +- [yacs](https://github.com/rbgirshick/yacs)>=0.1.8 + +## Data +STL10, Celeba and LSUNchurch dataset is used in the following folder structure: +``` +│STL10/ +├── train_X.bin +│── train_y.bin +├── test_X.bin +│── test_y.bin +│── unlabeled.bin +``` +``` +│Celeba/ +├──img_align_celeba/ +│ ├── 000017.jpg +│ │── 000019.jpg +│ ├── 000026.jpg +│ │── unlabeled.bin +│ │── ...... +``` +``` +│LSUNchurch/ +├──church_outdoor_train_lmdb/ +│ ├── data.mdb +│ │── lock.mdb +``` + +## Usage +To use the model with pretrained weights, download the `.pdparam` weight file and change related file paths in the following python scripts. The model config files are located in `./configs/`. + +For example, assume the downloaded weight file is stored in `./cifar10.pdparams`, to use the `styleformer_cifar10` model in python: +```python +from config import get_config +from generator import Generator +# config files in ./configs/ +config = get_config('./configs/styleformer_cifar10.yaml') +# build model +model = Generator(config) +# load pretrained weights, .pdparams is NOT needed +model_state_dict = paddle.load('./cifar10') +model.set_dict(model_state_dict) +``` + +## Generate Sample Images +To generate sample images from pretrained models, download the pretrained weights, and run the following script using command line: +```shell +sh run_generate.sh +``` +or +```shell +python generate.py \ + -cfg='./configs/styleformer_cifar10.yaml' \ + -num_out_images=16 \ + -out_folder='./images_cifar10' \ + -pretrained='./cifar10.pdparams' +``` +The output images are stored in `-out_folder` path. + + +## Evaluation +To evaluate Styleformer model performance on Cifar10 with a single GPU, run the following script using command line: +```shell +sh run_eval.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/styleformer_cifar10.yaml' \ + -dataset='cifar10' \ + -batch_size=32 \ + -eval \ + -pretrained='./cifar10' +``` + +
+ + +Run evaluation using multi-GPUs: + + + +```shell +sh run_eval_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_single_gpu.py \ + -cfg='./configs/styleformer_cifar10.yaml' \ + -dataset='cifar10' \ + -batch_size=32 \ + -eval \ + -pretrained='./cifar10' +``` + +
+ + +## Training +To train the Styleformer Transformer model on Cifar10 with single GPU, run the following script using command line: +```shell +sh run_train.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/styleformer_cifar10.yaml' \ + -dataset='cifar10' \ + -batch_size=32 \ + -pretrained='./cifar10' +``` + +
+ + +Run training using multi-GPUs: + + + +```shell +sh run_train_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +python main_single_gpu.py \ + -cfg='./configs/styleformer_cifar10.yaml' \ + -dataset='cifar10' \ + -batch_size=32 \ + -pretrained='./cifar10' +``` + +
+ + +## Visualization of Generated Images +### Generated Images after Training +drawing +
Generated Images from CelebA(left) and LSUN-church(right) datasets
+ +### Generated Images during Training +**(coming soon)** + +## Reference +``` +@article{park2021styleformer, + title={Styleformer: Transformer based Generative Adversarial Networks with Style Vector}, + author={Jeeseung Park and Younggeun Kim}, + year={2021}, + eprint={2106.07023}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/gan/Styleformer/Styleformer.jpg b/gan/Styleformer/Styleformer.jpg new file mode 100644 index 00000000..df8f1304 Binary files /dev/null and b/gan/Styleformer/Styleformer.jpg differ diff --git a/gan/Styleformer/celeba_dataset.py b/gan/Styleformer/celeba_dataset.py new file mode 100644 index 00000000..5065b02b --- /dev/null +++ b/gan/Styleformer/celeba_dataset.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +CelebA Dataset related classes and methods +Currently only support for GAN +""" + +import os +import glob +from PIL import Image +from paddle.io import Dataset + +class CelebADataset(Dataset): + """Build CelebA dataset + + This class gets train/val imagenet datasets, which loads transfomed data and labels. + + Attributes: + file_folder: path where align and cropped images are stored + transform: preprocessing ops to apply on image + """ + + def __init__(self, file_folder, transform=None): + """CelebA Dataset with dataset file path, and transform""" + super().__init__() + self.file_folder = file_folder + self.transform = transform + self.img_path_list = glob.glob(os.path.join(file_folder, '*.jpg')) + print(f'----- CelebA img_align len = {len(self.img_path_list)}') + + def __len__(self): + return len(self.img_path_list) + + def __getitem__(self, index): + img = Image.open(self.img_path_list[index]).convert('RGB') + if self.transform is not None: + img = self.transform(img) + label = 0 + return img, label + +#if __name__ == "__main__": +# dataset = CelebADataset(file_folder='./celeba/img_align_celeba') +# for idx, (data, label) in enumerate(dataset): +# print(idx) +# print(data.size) +# print('-----') +# if idx == 10: +# break diff --git a/gan/Styleformer/config.py b/gan/Styleformer/config.py new file mode 100644 index 00000000..cd37679d --- /dev/null +++ b/gan/Styleformer/config.py @@ -0,0 +1,169 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration + +Configuration for data, model archtecture, and training, etc. 
+Config can be set by .yaml file or by argparser(limited usage) + +""" + +import os +from yacs.config import CfgNode as CN +import yaml + +_C = CN() +_C.BASE = [''] + +# data settings +_C.DATA = CN() +_C.DATA.BATCH_SIZE = 32 #1024 batch_size for single GPU +_C.DATA.BATCH_SIZE_EVAL = 32 #1024 batch_size for single GPU +_C.DATA.DATA_PATH = '/dataset/cifar10/' # path to dataset +_C.DATA.DATASET = 'cifar10' # dataset name +_C.DATA.IMAGE_SIZE = 32 # input image size +_C.DATA.CHANNEL = 3 # input image channel +_C.DATA.CROP_PCT = 1.0 # input image scale ratio, scale is applied before centercrop in eval mode +_C.DATA.NUM_WORKERS = 2 # number of data loading threads +_C.DATA.MAX_REAL_NUM = None # number of images used in the dataset (real images) +_C.DATA.MAX_GEN_NUM = None # number of images used in the generator (fake images) + +# model settings +_C.MODEL = CN() +_C.MODEL.TYPE = 'Styleformer' +_C.MODEL.NAME = 'Styleformer_Large' +_C.MODEL.RESUME = None +_C.MODEL.PRETRAINED = None +_C.MODEL.NUM_CLASSES = 10 +_C.MODEL.DROPOUT = 0.0 +_C.MODEL.ATTENTION_DROPOUT = 0.0 +_C.MODEL.DROP_PATH = 0.1 + +# transformer settings +_C.MODEL.GEN = CN() +_C.MODEL.GEN.RESOLUTION = 12 +_C.MODEL.GEN.Z_DIM = 512 +_C.MODEL.GEN.C_DIM = 0 +_C.MODEL.GEN.W_DIM = 512 +_C.MODEL.GEN.DEPTH = 32 +_C.MODEL.GEN.NUM_LAYERS = [1,3,3] +_C.MODEL.GEN.G_DICT = [1024,512,512] +_C.MODEL.GEN.LINFORMER = False +_C.MODEL.DIS = CN() +_C.MODEL.DIS.CHANNEL_MUTIPLIER = 2 +_C.MODEL.DIS.BLUR_KERNEL = [1, 3, 3, 1] + +# training settings +_C.TRAIN = CN() +_C.TRAIN.LAST_EPOCH = 0 +_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 20 +_C.TRAIN.WEIGHT_DECAY = 0.05 +_C.TRAIN.BASE_LR = 0.002 +_C.TRAIN.WARMUP_START_LR = 0.0 +_C.TRAIN.END_LR = 0.0 +_C.TRAIN.GRAD_CLIP = 1.0 + +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' +_C.TRAIN.LR_SCHEDULER.MILESTONES = "30, 60, 90" # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 # only used in StepLRScheduler + +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'Adam' +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +_C.TRAIN.OPTIMIZER.BETAS = (0, 0.99) +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# augmentation +_C.AUG = CN() +_C.AUG.COLOR_JITTER = 0.4 # color jitter factor +_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' +_C.AUG.RE_PROB = 0.25 # random earse prob +_C.AUG.RE_MODE = 'pixel' # random earse mode +_C.AUG.RE_COUNT = 1 # random earse count +_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 +_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 +_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha +_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled +_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled +_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' + +# misc +_C.SAVE = "./output" +_C.TAG = "default" +_C.SAVE_FREQ = 1 # freq to save chpt +_C.REPORT_FREQ = 32 # freq to logging info +_C.VALIDATE_FREQ = 20 # freq to do validation +_C.SEED = 0 +_C.EVAL = False # run evaluation only +_C.LOCAL_RANK = 0 +_C.NGPUS = -1 + + +def _update_config_from_file(config, cfg_file): + config.defrost() + with open(cfg_file, 'r') as infile: + yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader) + for cfg in yaml_cfg.setdefault('BASE', ['']): + if cfg: + _update_config_from_file( + config, os.path.join(os.path.dirname(cfg_file), cfg) + ) + print('merging config from {}'.format(cfg_file)) + 
config.merge_from_file(cfg_file) + config.freeze() + + +def update_config(config, args): + """Update config by ArgumentParser + Args: + args: ArgumentParser contains options + Return: + config: updated config + """ + if args.cfg: + _update_config_from_file(config, args.cfg) + config.defrost() + if args.dataset: + config.DATA.DATASET = args.dataset + if args.batch_size: + config.DATA.BATCH_SIZE = args.batch_size + if args.image_size: + config.DATA.IMAGE_SIZE = args.image_size + if args.data_path: + config.DATA.DATA_PATH = args.data_path + if args.ngpus: + config.NGPUS = args.ngpus + if args.eval: + config.EVAL = True + config.DATA.BATCH_SIZE_EVAL = args.batch_size + if args.pretrained: + config.MODEL.PRETRAINED = args.pretrained + # if args.resume: + # config.MODEL.RESUME = args.resume + # if args.last_epoch: + # config.MODEL.LAST_EPOCH = args.last_epoch + + #config.freeze() + return config + + +def get_config(cfg_file=None): + """Return a clone of config or load from yaml file""" + config = _C.clone() + if cfg_file: + _update_config_from_file(config, cfg_file) + return config diff --git a/gan/Styleformer/configs/styleformer_celeba.yaml b/gan/Styleformer/configs/styleformer_celeba.yaml new file mode 100644 index 00000000..5ae61fcc --- /dev/null +++ b/gan/Styleformer/configs/styleformer_celeba.yaml @@ -0,0 +1,13 @@ +DATA: + IMAGE_SIZE: 64 + MAX_GEN_NUM: 50000 + MAX_REAL_NUM: None +MODEL: + TYPE: Styleformer + NAME: Styleformer_Linformer + NUM_CLASSES: 10177 + GEN: + RESOLUTION: 8 + NUM_LAYERS: [1,2,1,1] + G_DICT: [1024,256,64,64] + LINFORMER: True diff --git a/gan/Styleformer/configs/styleformer_cifar10.yaml b/gan/Styleformer/configs/styleformer_cifar10.yaml new file mode 100644 index 00000000..37c6bea8 --- /dev/null +++ b/gan/Styleformer/configs/styleformer_cifar10.yaml @@ -0,0 +1,13 @@ +DATA: + IMAGE_SIZE: 32 + MAX_GEN_NUM: 50000 + MAX_REAL_NUM: None +MODEL: + TYPE: Styleformer + NAME: Styleformer_Large + NUM_CLASSES: 10 + GEN: + RESOLUTION: 8 + NUM_LAYERS: [1,3,3] + G_DICT: [1024,512,512] + diff --git a/gan/Styleformer/configs/styleformer_lsun.yaml b/gan/Styleformer/configs/styleformer_lsun.yaml new file mode 100644 index 00000000..83a69da6 --- /dev/null +++ b/gan/Styleformer/configs/styleformer_lsun.yaml @@ -0,0 +1,13 @@ +DATA: + IMAGE_SIZE: 128 + MAX_GEN_NUM: 50000 + MAX_REAL_NUM: None +MODEL: + TYPE: Styleformer + NAME: Styleformer_Linformer + NUM_CLASSES: 1 + GEN: + RESOLUTION: 8 + NUM_LAYERS: [1,2,1,1,1] + G_DICT: [1024,256,64,64,64] + LINFORMER: True diff --git a/gan/Styleformer/configs/styleformer_stl10.yaml b/gan/Styleformer/configs/styleformer_stl10.yaml new file mode 100644 index 00000000..66dd9fe1 --- /dev/null +++ b/gan/Styleformer/configs/styleformer_stl10.yaml @@ -0,0 +1,14 @@ +DATA: + IMAGE_SIZE: 48 + MAX_GEN_NUM: 50000 + MAX_REAL_NUM: None +MODEL: + TYPE: Styleformer + NAME: Styleformer_Medium + NUM_CLASSES: 1 # unlabeled data, all class 0 + GEN: + RESOLUTION: 12 + NUM_LAYERS: [1,3,3] + G_DICT: [1024,256,64] + + diff --git a/gan/Styleformer/datasets.py b/gan/Styleformer/datasets.py new file mode 100644 index 00000000..7e065c9c --- /dev/null +++ b/gan/Styleformer/datasets.py @@ -0,0 +1,211 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Dataset related classes and methods for Styleformer training and validation +Cifar10, STL10Dataset, LSUNchurch, Celeba and ImageNet2012 are supported +""" + +import os +import math +from paddle.io import Dataset +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.vision import transforms +from paddle.vision import datasets +from paddle.vision import image_load +from stl10_dataset import STL10Dataset +from lsun_church_dataset import LSUNchurchDataset +from celeba_dataset import CelebADataset + +class ImageNet2012Dataset(Dataset): + """Build ImageNet2012 dataset + + This class gets train/val imagenet datasets, which loads transfomed data and labels. + + Attributes: + file_folder: path where imagenet images are stored + transform: preprocessing ops to apply on image + img_path_list: list of full path of images in whole dataset + label_list: list of labels of whole dataset + """ + + def __init__(self, file_folder, mode="train", transform=None): + """Init ImageNet2012 Dataset with dataset file path, mode(train/val), and transform""" + super(ImageNet2012Dataset, self).__init__() + assert mode in ["train", "val"] + self.file_folder = file_folder + self.transform = transform + self.img_path_list = [] + self.label_list = [] + + if mode == "train": + self.list_file = os.path.join(self.file_folder, "train_list.txt") + else: + self.list_file = os.path.join(self.file_folder, "val_list.txt") + + with open(self.list_file, 'r') as infile: + for line in infile: + img_path = line.strip().split()[0] + img_label = int(line.strip().split()[1]) + self.img_path_list.append(os.path.join(self.file_folder, img_path)) + self.label_list.append(img_label) + print(f'----- Imagenet2012 image {mode} list len = {len(self.label_list)}') + + def __len__(self): + return len(self.label_list) + + def __getitem__(self, index): + data = image_load(self.img_path_list[index]).convert('RGB') + data = self.transform(data) + label = self.label_list[index] + + return data, label + + +def get_train_transforms(config): + """ Get training transforms + + For training, a RandomResizedCrop is applied, then normalization is applied with + [0.5, 0.5, 0.5] mean and std. The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) + transforms_train = transforms.Compose([ + transforms.Resize(scale_size, interpolation='lanczos'), + transforms.ToTensor(), + ]) + return transforms_train + + +def get_val_transforms(config): + """ Get training transforms + + For validation, image is first Resize then CenterCrop to image_size. + Then normalization is applied with [0.5, 0.5, 0.5] mean and std. + The input pixel values must be rescaled to [0, 1.] 
+ Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) + transforms_val = transforms.Compose([ + transforms.Resize([scale_size, scale_size], interpolation='lanczos'), + transforms.ToTensor(), + ]) + return transforms_val + + +def get_dataset(config, mode='train'): + """ Get dataset from config and mode (train/val) + + Returns the related dataset object according to configs and mode(train/val) + + Args: + config: configs contains dataset related settings. see config.py for details + Returns: + dataset: dataset object + """ + + assert mode in ['train', 'val'] + if config.DATA.DATASET == "cifar10": + if mode == 'train': + dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'train' + dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "stl10": + if mode == 'train': + dataset = STL10Dataset(file_folder=config.DATA.DATA_PATH, + mode=mode, + transform=get_train_transforms(config)) + else: + #mode = 'test' + mode = 'unlabeled' + dataset = STL10Dataset(file_folder=config.DATA.DATA_PATH, + mode=mode, + transform=get_val_transforms(config)) + elif config.DATA.DATASET == "lsun": + if mode == 'train': + dataset = LSUNchurchDataset(file_folder=config.DATA.DATA_PATH, + mode=mode, + transform=get_train_transforms(config)) + else: + dataset = LSUNchurchDataset(file_folder=config.DATA.DATA_PATH, + mode=mode, + transform=get_val_transforms(config)) + elif config.DATA.DATASET == "celeba": + if mode == 'train': + dataset = CelebADataset(file_folder=config.DATA.DATA_PATH, + transform=get_train_transforms(config)) + else: + dataset = CelebADataset(file_folder=config.DATA.DATA_PATH, + transform=get_val_transforms(config)) + elif config.DATA.DATASET == "imagenet2012": + if mode == 'train': + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_train_transforms(config)) + else: + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_val_transforms(config)) + else: + raise NotImplementedError( + "Only support cifar10, cifar100, imagenet2012, celeba, stl10, lsun") + return dataset + + +def get_dataloader(config, dataset, mode='train', multi_process=False): + """Get dataloader with config, dataset, mode as input, allows multiGPU settings. + + Multi-GPU loader is implements as distributedBatchSampler. + + Args: + config: see config.py for details + dataset: paddle.io.dataset object + mode: train/val + multi_process: if True, use DistributedBatchSampler to support multi-processing + Returns: + dataloader: paddle.io.DataLoader object. + """ + + if mode == 'train': + batch_size = config.DATA.BATCH_SIZE + else: + batch_size = config.DATA.BATCH_SIZE_EVAL + + if multi_process is True: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=(mode == 'train')) + dataloader = DataLoader(dataset, + batch_sampler=sampler, + num_workers=config.DATA.NUM_WORKERS) + else: + dataloader = DataLoader(dataset, + batch_size=batch_size, + num_workers=config.DATA.NUM_WORKERS, + shuffle=(mode == 'train')) + return dataloader diff --git a/gan/Styleformer/discriminator.py b/gan/Styleformer/discriminator.py new file mode 100644 index 00000000..d54e0829 --- /dev/null +++ b/gan/Styleformer/discriminator.py @@ -0,0 +1,170 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from utils.equalized import EqualLinear, EqualConv2D +from utils.fused_act import FusedLeakyReLU +from utils.upfirdn2d import Upfirdn2dBlur + + +class ConvLayer(nn.Sequential): + def __init__( + self, + in_channel, + out_channel, + kernel_size, + downsample=False, + blur_kernel=[1, 3, 3, 1], + bias=True, + activate=True, + ): + layers = [] + + if downsample: + factor = 2 + p = (len(blur_kernel) - factor) + (kernel_size - 1) + pad0 = (p + 1) // 2 + pad1 = p // 2 + + layers.append(Upfirdn2dBlur(blur_kernel, pad=(pad0, pad1))) + + stride = 2 + self.padding = 0 + + else: + stride = 1 + self.padding = kernel_size // 2 + + layers.append( + EqualConv2D( + in_channel, + out_channel, + kernel_size, + padding=self.padding, + stride=stride, + bias=bias and not activate, + )) + + if activate: + layers.append(FusedLeakyReLU(out_channel, bias=bias)) + + super().__init__(*layers) + + +class ResBlock(nn.Layer): + def __init__(self, in_channel, out_channel, blur_kernel=[1, 3, 3, 1]): + super().__init__() + + self.conv1 = ConvLayer(in_channel, in_channel, 3) + self.conv2 = ConvLayer(in_channel, out_channel, 3, downsample=True) + + self.skip = ConvLayer(in_channel, + out_channel, + 1, + downsample=True, + activate=False, + bias=False) + + def forward(self, input): + out = self.conv1(input) + out = self.conv2(out) + + skip = self.skip(input) + out = (out + skip) / math.sqrt(2) + + return out + + +# temporally solve pow double grad problem +def var(x, axis=None, unbiased=True, keepdim=False, name=None): + + u = paddle.mean(x, axis, True, name) + out = paddle.sum((x - u) * (x - u), axis, keepdim=keepdim, name=name) + + n = paddle.cast(paddle.numel(x), x.dtype) \ + / paddle.cast(paddle.numel(out), x.dtype) + if unbiased: + one_const = paddle.ones([1], x.dtype) + n = paddle.where(n > one_const, n - 1., one_const) + out /= n + return out + + +class StyleGANv2Discriminator(nn.Layer): + def __init__(self, config): + super().__init__() + self.size = config.DATA.IMAGE_SIZE + self.channel_multiplier = config.MODEL.DIS.CHANNEL_MUTIPLIER + self.blur_kernel = config.MODEL.DIS.BLUR_KERNEL + channels = { + 4: 512, + 8: 512, + 16: 512, + 32: 512, + 48: 512, + 64: 256 * self.channel_multiplier, + 128: 128 * self.channel_multiplier, + 256: 64 * self.channel_multiplier, + 512: 32 * self.channel_multiplier, + 1024: 16 * self.channel_multiplier, + } + + convs = [ConvLayer(3, channels[self.size], 1)] + + log_size = int(math.log(self.size, 2)) + + in_channel = channels[self.size] + + for i in range(log_size, 2, -1): + out_channel = channels[2**(i - 1)] + + convs.append(ResBlock(in_channel, out_channel, self.blur_kernel)) + + in_channel = out_channel + + self.convs = nn.Sequential(*convs) + + self.stddev_group = 4 + self.stddev_feat = 1 + + self.final_conv = ConvLayer(in_channel + 1, channels[4], 3) + self.final_linear = nn.Sequential( + EqualLinear(channels[4] * 4 * 4, + channels[4], + 
activation="fused_lrelu"), + EqualLinear(channels[4], 1), + ) + + def forward(self, input): + out = self.convs(input) + + batch, channel, height, width = out.shape + group = min(batch, self.stddev_group) + stddev = out.reshape((group, -1, self.stddev_feat, + channel // self.stddev_feat, height, width)) + stddev = paddle.sqrt(var(stddev, 0, unbiased=False) + 1e-8) + stddev = stddev.mean([2, 3, 4], keepdim=True).squeeze(2) + stddev = stddev.tile((group, 1, height, width)) + out = paddle.concat([out, stddev], 1) + + out = self.final_conv(out) + + out = out.reshape((batch, -1)) + out = self.final_linear(out) + + return out diff --git a/gan/Styleformer/fig2.png b/gan/Styleformer/fig2.png new file mode 100644 index 00000000..f4fbace7 Binary files /dev/null and b/gan/Styleformer/fig2.png differ diff --git a/gan/Styleformer/generate.py b/gan/Styleformer/generate.py new file mode 100644 index 00000000..6831d5b3 --- /dev/null +++ b/gan/Styleformer/generate.py @@ -0,0 +1,80 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Generate images using trained models""" +import argparse +import os +from PIL import Image +import paddle +from generator import Generator +from config import get_config +from config import update_config + + +def main(): + """ generate sample images using pretrained model + The following args are required: + -cfg: str, path of yaml model config file + -pretrained: str, path of the pretrained model (ends with .pdparams) + -num_out_images: int, the num of output images to be saved in file + -out_folder: str, output folder path. 
+ """ + paddle.set_device('gpu') + # get config + parser = argparse.ArgumentParser('Generate samples images') + parser.add_argument('-cfg', type=str, default='./configs/styleformer_cifar10.yaml') + parser.add_argument('-pretrained', type=str, default='./lsun.pdparams') + parser.add_argument('-num_out_images', type=int, default=16) + parser.add_argument('-out_folder', type=str, default='./out_images_lsun') + + parser.add_argument('-dataset', type=str, default=None) + parser.add_argument('-batch_size', type=int, default=None) + parser.add_argument('-image_size', type=int, default=None) + parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-data_path', type=str, default=None) + parser.add_argument('-eval', action="store_true") + + args = parser.parse_args() + config = get_config() + config = update_config(config, args) + # get model + print(f'----- Creating model...') + paddle_model = Generator(config) + paddle_model.eval() + # load model weights + print(f'----- Loading model form {config.MODEL.PRETRAINED}...') + model_state_dict = paddle.load(config.MODEL.PRETRAINED) + paddle_model.load_dict(model_state_dict) + # get random input tensor + x_paddle = paddle.randn([args.num_out_images, paddle_model.z_dim]) + # inference + print(f'----- Inferencing...') + out_paddle = paddle_model( + z=x_paddle, c=paddle.randint(0, config.MODEL.NUM_CLASSES, [args.num_out_images])) + # post processing to obtain image + print('----- Postprocessing') + gen_imgs = (out_paddle * 127.5 + 128).clip(0, 255) + gen_imgs = gen_imgs.transpose((0, 2, 3, 1)).astype('uint8') + gen_imgs = gen_imgs.cpu().numpy() + # save images to file + os.makedirs(args.out_folder, exist_ok=True) + print(f'----- Saving images to {args.out_folder}') + for i, gen_img in enumerate(gen_imgs): + img = Image.fromarray(gen_img, 'RGB') + out_path = os.path.join(args.out_folder, str(i) + '.png') + img.save(out_path) + + +if __name__ == "__main__": + main() diff --git a/gan/Styleformer/generator.py b/gan/Styleformer/generator.py new file mode 100644 index 00000000..a9c79ea2 --- /dev/null +++ b/gan/Styleformer/generator.py @@ -0,0 +1,634 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from utils.upfirdn2d import setup_filter, Upfirdn2dUpsample +from utils.fused_act import fused_leaky_relu + + +def bias_act(x, b=None, dim=1, gain=None, clamp=None): + """Slow reference implementation of `bias_act()` + """ + # spec = activation_funcs[act] + # alpha = float(alpha if alpha is not None else 0) + gain = float(gain if gain is not None else 1) + clamp = float(clamp if clamp is not None else -1) + + # Add bias. + if b is not None: + x = x + b.reshape([-1 if i == dim else 1 for i in range(x.ndim)]) + + # Evaluate activation function. + # alpha = float(alpha) + # x = spec.func(x, alpha=alpha) + + # Scale by gain. + gain = float(gain) + if gain != 1: + x = x * gain + + # Clamp. 
+ if clamp >= 0: + x = x.clip(-clamp, clamp) # pylint: disable=invalid-unary-operand-type + return x + + +def normalize_2nd_moment(x, dim=-1, eps=1e-8): + return x * (x.square().mean(axis=dim, keepdim=True) + eps).rsqrt() + + +def lerp(p0, p1, t): + """Linear interpolation.""" + return (1.0 - t) * p0 + t * p1 + + +def modulated_style_mlp(x, weight, styles): + batch_size = x.shape[0] + channel = x.shape[1] + width = x.shape[2] + height = x.shape[3] + w = None + dcoefs = None + + w = weight.unsqueeze(0) + w = w * styles.reshape([batch_size, 1, -1]) + dcoefs = (w.square().sum(axis=[2]) + 1e-8).rsqrt() + + x = x.reshape([batch_size, channel, width * height]).transpose([0, 2, 1]) + x = x * paddle.to_tensor(styles, dtype='float32').reshape([batch_size, 1, -1]) + x = paddle.matmul(x, weight.t()) + x = x * paddle.to_tensor(dcoefs, dtype='float32').reshape([batch_size, 1, -1]) + x = x.transpose([0, 2, 1]).reshape([batch_size, -1, width, height]) + + return x + + +def modulated_channel_attention(x, q_weight, k_weight, v_weight, w_weight, + u_weight, proj_weight, styles, num_heads): + """Style modulation effect to the input. + input feature map is scaled through a style vector, + which is equivalent to scaling the linear weight. + """ + batch_size = x.shape[0] + seq_length = x.shape[1] + hidden_dimension = x.shape[2] + depth = hidden_dimension // num_heads + attention_scale = paddle.to_tensor(depth ** -0.5, dtype='float32') + + layernorm = nn.InstanceNorm1D(seq_length) + + styles1 = styles[:, :hidden_dimension] + styles2 = styles[:, hidden_dimension:] + + x = x * (styles1.reshape([batch_size, 1, -1])) + x = layernorm(x) + + q = q_weight.unsqueeze(0) + q = q * styles1.reshape([batch_size, 1, -1]) + q_dcoefs = (q.square().sum(axis=[2]) + 1e-8).rsqrt() + + k = k_weight.unsqueeze(0) + k = k * styles1.reshape([batch_size, 1, -1]) + k_dcoefs = (k.square().sum(axis=[2]) + 1e-8).rsqrt() + + v = v_weight.unsqueeze(0) + v = v * styles1.reshape([batch_size, 1, -1]) + v_dcoefs = (v.square().sum(axis=[2]) + 1e-8).rsqrt() + + w = w_weight.unsqueeze(0) + w = w * styles2.reshape([batch_size, 1, -1]) + w_dcoefs = (w.square().sum(axis=[2]) + 1e-8).rsqrt() + + q_value = paddle.matmul(x, q_weight.t()) * q_dcoefs.reshape([batch_size, 1, -1]) + q_value = q_value.reshape([batch_size, seq_length, num_heads, depth]).transpose([0, 2, 1, 3]) + k_value = paddle.matmul(x, k_weight.t()) * k_dcoefs.reshape([batch_size, 1, -1]) + k_value = k_value.reshape([batch_size, seq_length, num_heads, depth]).transpose([0, 2, 1, 3]) + if proj_weight is not None: + k_value = paddle.matmul(k_value.transpose([0, 1, 3, 2]), + proj_weight.t()).transpose([0, 1, 3, 2]) + v_value = paddle.matmul(x, v_weight.t()) + v_value = v_value * v_dcoefs.reshape([batch_size, 1, -1]) + + v_value = v_value * styles2.reshape([batch_size, 1, -1]) + skip = v_value + if proj_weight is not None: + v_value = paddle.matmul(v_value.transpose([0, 2, 1]), proj_weight.t()) + v_value = v_value.transpose([0, 2, 1]) + v_value = v_value.reshape([batch_size, 256, num_heads, depth]).transpose([0, 2, 1, 3]) + else: + v_value = v_value.reshape([batch_size, seq_length, num_heads, depth]) + v_value = v_value.transpose([0, 2, 1, 3]) + + attn = paddle.matmul(q_value, k_value.transpose([0, 1, 3, 2])) * attention_scale + revised_attn = attn + attn_score = F.softmax(revised_attn, axis=-1) + + x = paddle.matmul(attn_score , v_value).transpose([0, 2, 1, 3]) + x = x.reshape([batch_size, seq_length, hidden_dimension]) + x = paddle.matmul(x, paddle.to_tensor(w_weight.t(), dtype='float32')) + x 
= x * paddle.to_tensor(w_dcoefs, dtype='float32').reshape([batch_size, 1, -1]) + + u = u_weight.unsqueeze(0) + u = u * styles2.reshape([batch_size, 1, -1]) + u_dcoefs = (u.square().sum(axis=[2]) + 1e-8).rsqrt() + + skip = paddle.matmul(skip, paddle.to_tensor(u_weight.t(), dtype='float32')) + skip = skip * paddle.to_tensor(u_dcoefs, dtype='float32').reshape([batch_size, 1, -1]) + + x = x + skip + + return x + + +class FullyConnectedLayer(nn.Layer): + """ FullyConnectedLayer + + Attributes: + in_features: Number of input features. + out_features: Number of output features. + bias: Apply additive bias before the activation function + activation: Activation function: 'relu', 'lrelu', etc. + lr_multiplier: Learning rate multiplier. + bias_init: Initial value for the additive bias. + """ + def __init__(self, + in_features, + out_features, + bias=True, + activation='linear', + lr_multiplier=1, + bias_init=0): + super().__init__() + self.activation = activation + self.in_features = in_features + self.out_features = out_features + self.weight = self.create_parameter( + shape=[out_features, in_features], + dtype='float32', + default_initializer=paddle.nn.initializer.Normal(std=1e-6)) + self.bias = self.create_parameter( + shape=[out_features], + dtype='float32', + default_initializer=paddle.nn.initializer.Constant(bias_init)) if bias else None + self.weight_gain = lr_multiplier / np.sqrt(in_features) + self.bias_gain = lr_multiplier + + def forward(self, x): + w = paddle.to_tensor(self.weight, dtype='float32') * self.weight_gain + b = self.bias + if b is not None: + b = paddle.to_tensor(b, dtype='float32') + if self.bias_gain != 1: + b = b * self.bias_gain + + if self.activation == 'linear' and b is not None: + x = paddle.addmm(b.unsqueeze(0), x, w.t()) + else: + x = x.matmul(w.t()) + x = fused_leaky_relu(x, b) + return x + + +class MappingNetwork(nn.Layer): + """ MappingNetwork + + Mapping networks learned affine transformations. + + Attributes: + z_dim: Input latent (Z) dimensionality, 0 = no latent. + c_dim: Conditioning label (C) dimensionality, 0 = no label. + w_dim: Intermediate latent (W) dimensionality. + num_ws: Number of intermediate latents to output, None = do not broadcast. + num_layers: Number of mapping layers. + embed_features: Label embedding dimensionality, None = same as w_dim. + layer_features: Number of intermediate features in the mapping layers, None = same as w_dim. + activation: Activation function: 'relu', 'lrelu', etc. + lr_multiplier: Learning rate multiplier for the mapping layers. + w_avg_beta: Decay for tracking the moving average of W during training, None = do not track. 
+ """ + + def __init__(self, + z_dim, + c_dim, + w_dim, + num_ws, + num_layers=2, + embed_features=None, + layer_features=None, + activation='lrelu', + lr_multiplier=0.01, + w_avg_beta=0.995): + super().__init__() + self.z_dim = z_dim + self.c_dim = c_dim + self.w_dim = w_dim + self.num_ws = num_ws + self.num_layers = num_layers + self.w_avg_beta = w_avg_beta + + if embed_features is None: + embed_features = w_dim + if c_dim == 0: + embed_features = 0 + if layer_features is None: + layer_features = w_dim + features_list = [z_dim + embed_features] + [layer_features] * (num_layers - 1) + [w_dim] + + if c_dim > 0: + self.embed = FullyConnectedLayer(c_dim, embed_features) + for idx in range(num_layers): + in_features = features_list[idx] + out_features = features_list[idx + 1] + layer = FullyConnectedLayer(in_features, + out_features, + activation=activation, + lr_multiplier=lr_multiplier) + setattr(self, f'fc{idx}', layer) + + if num_ws is not None and w_avg_beta is not None: + self.register_buffer('w_avg', paddle.zeros([w_dim])) + + def forward(self, z, c, truncation_psi=1, truncation_cutoff=None, skip_w_avg_update=False): + # Embed, normalize, and concat inputs. + x = None + + if self.z_dim > 0: + x = normalize_2nd_moment(paddle.to_tensor(z, dtype='float32')) + if self.c_dim > 0: + y = normalize_2nd_moment(paddle.to_tensor(self.embed(c), dtype='float32')) + x = paddle.concat([x, y], axis=1) if x is not None else y + + # Main layers. + for idx in range(self.num_layers): + layer = getattr(self, f'fc{idx}') + x = layer(x) + + # Update moving average of W. + if self.w_avg_beta is not None and self.training and not skip_w_avg_update: + self.w_avg = (lerp(x.detach().mean(axis=0), self.w_avg, self.w_avg_beta)) + + # Broadcast. + if self.num_ws is not None: + x = x.unsqueeze(1).tile([1, self.num_ws, 1]) + + # Apply truncation. 
+ if truncation_psi != 1: + if self.num_ws is None or truncation_cutoff is None: + x = lerp(self.w_avg, x, truncation_psi) + else: + x[:, :truncation_cutoff] = lerp(self.w_avg, x[:, :truncation_cutoff], truncation_psi) + return x + + +class Encoderlayer(nn.Layer): + """ Encoderlayer""" + def __init__(self, h_dim, w_dim, out_dim, seq_length, depth, minimum_head, use_noise=True, + conv_clamp=None, proj_weight=None, channels_last=False): + super().__init__() + self.h_dim = h_dim + self.num_heads = max(minimum_head, h_dim // depth) + self.w_dim = w_dim + self.out_dim = out_dim + self.seq_length = seq_length + self.use_noise = use_noise + self.conv_clamp = conv_clamp + self.affine1 = FullyConnectedLayer(w_dim, h_dim * 2, bias_init=1) + + # memory_format = paddle.channels_last if channels_last else paddle.contiguous_format + weight_min = -1./math.sqrt(h_dim) + weight_max = 1./math.sqrt(h_dim) + self.q_weight = self.create_parameter( + shape=[h_dim, h_dim], + dtype='float32', + default_initializer=paddle.nn.initializer.Uniform(weight_min, weight_max)) + self.k_weight = self.create_parameter( + shape=[h_dim, h_dim], + dtype='float32', + default_initializer=paddle.nn.initializer.Uniform(weight_min, weight_max)) + self.v_weight = self.create_parameter( + shape=[h_dim, h_dim], + dtype='float32', + default_initializer=paddle.nn.initializer.Uniform(weight_min, weight_max)) + self.w_weight = self.create_parameter( + shape=[out_dim, h_dim], + dtype='float32', + default_initializer=paddle.nn.initializer.Uniform(weight_min, weight_max)) + + self.proj_weight = proj_weight + self.u_weight = self.create_parameter( + shape=[out_dim, h_dim], + dtype='float32', + default_initializer=paddle.nn.initializer.Uniform(weight_min, weight_max)) + if use_noise: + self.register_buffer('noise_const', paddle.randn([self.seq_length, 1])) + self.noise_strength = self.create_parameter( + shape=[1], + dtype='float32', + default_initializer=paddle.nn.initializer.Constant(0.0)) + self.bias = self.create_parameter( + shape=[out_dim], + dtype='float32', + default_initializer=paddle.nn.initializer.Constant(0.0)) + + def forward(self, x, w, noise_mode='random', gain=1): + styles1 = self.affine1(w) + noise = None + + if self.use_noise and noise_mode == 'random': + noise = paddle.randn([x.shape[0], self.seq_length, 1]) * self.noise_strength[0] + if self.use_noise and noise_mode == 'const': + noise = self.noise_const * self.noise_strength[0] + + x = modulated_channel_attention(x=x, q_weight=self.q_weight, k_weight=self.k_weight, + v_weight=self.v_weight, w_weight=self.w_weight, u_weight=self.u_weight, + proj_weight=self.proj_weight, styles=styles1, num_heads=self.num_heads) + + if noise is not None: + x = x.add_(noise) + + act_clamp = self.conv_clamp * gain if self.conv_clamp is not None else None + + x = x + paddle.to_tensor(self.bias, dtype='float32') + x = F.leaky_relu(x, negative_slope=0.2) + x = paddle.clip(x, max=act_clamp, min=-act_clamp) + + return x + + +class ToRGBLayer(nn.Layer): + """ToRGBLayer + + Convert reshaped output for each resolution into an RGB channel. 
+ + """ + + def __init__(self, in_channels, out_channels, w_dim, conv_clamp=None, channels_last=False): + super().__init__() + self.conv_clamp = None + self.affine = FullyConnectedLayer(w_dim, in_channels, bias_init=1) + # memory_format = paddle.channels_last if channels_last else paddle.contiguous_format + self.weight = self.create_parameter( + shape=[out_channels, in_channels], + dtype='float32', + default_initializer=paddle.nn.initializer.Uniform( + -1./math.sqrt(in_channels), 1./math.sqrt(in_channels))) + self.bias = self.create_parameter( + shape=[out_channels], + dtype='float32', + default_initializer=paddle.nn.initializer.Constant(0.0)) + + def forward(self, x, w, fused_modconv=True): + styles = self.affine(w) + x = modulated_style_mlp(x=x, weight=self.weight, styles=styles) + + x = bias_act(x, self.bias, clamp=self.conv_clamp) + + return x + + +class EncoderBlock(nn.Layer): + """EncoderBlock + + Attributes: + w_dim: Intermediate latent (W) dimensionality. + img_resolution: int, size of image + img_channels: int, channel of input image + """ + + def __init__(self, h_dim, w_dim, out_dim, depth, minimum_head, img_resolution, resolution, + img_channels, is_first, is_last, init_resolution, architecture='skip', linformer=False, + conv_clamp=None, use_fp16=False, fp16_channels_last=False, resample_filter =[1,3,3,1], + scale_ratio=2): + super().__init__() + self.h_dim = h_dim + self.w_dim = w_dim + self.out_dim = out_dim + self.depth = depth + self.minimum_head = minimum_head + self.img_resolution = img_resolution + self.init_resolution = init_resolution + self.resolution = resolution + self.img_channels = img_channels + self.seq_length = resolution * resolution + self.is_first = is_first + self.is_last = is_last + self.architecture = architecture + self.use_fp16 = use_fp16 + self.register_buffer('resample_filter', setup_filter(resample_filter)) + self.channels_last = (use_fp16 and fp16_channels_last) + self.num_attention = 0 + self.num_torgb = 0 + self.scale_ratio = scale_ratio + self.conv_clamp = conv_clamp + self.proj_weight = None + + # memory_format = paddle.contiguous_format + + if self.resolution>=32 and linformer: + self.proj_weight = self.create_parameter( + shape=[256, self.seq_length], + dtype='float32', + default_initializer=paddle.nn.initializer.Uniform( + -1./math.sqrt(self.seq_length), 1./math.sqrt(self.seq_length))) + + if self.resolution == self.init_resolution and self.is_first: + self.const = self.create_parameter( + shape=[self.seq_length, self.h_dim], + dtype='float32', + default_initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + + if self.is_first: + self.pos_embedding = self.create_parameter( + shape=[1, self.seq_length, self.h_dim], + dtype='float32', + default_initializer=paddle.nn.initializer.Constant(0)) + + if not self.is_last or out_dim is None: + self.out_dim = h_dim + + self.enc = Encoderlayer(h_dim=self.h_dim, w_dim=self.w_dim, out_dim=self.out_dim, + seq_length=self.seq_length, depth=self.depth, minimum_head=self.minimum_head, + conv_clamp=self.conv_clamp, proj_weight=self.proj_weight) + self.num_attention += 1 + + if self.is_last and self.architecture == 'skip': + self.torgb = ToRGBLayer(self.out_dim, self.img_channels, w_dim=w_dim, + conv_clamp=conv_clamp, channels_last=self.channels_last) + self.num_torgb += 1 + + def forward(self, x, img, ws, force_fp32=True, fused_modconv=None): + w_iter = iter(ws.unbind(axis=1)) + # memory_format = paddle.channels_last if self.channels_last and not force_fp32 else paddle.contiguous_format + # if 
fused_modconv is None: + # fused_modconv = (not self.training) and (fused_modconv.dtype == 'float32' or int(x.shape[0]) == 1) + + #Input + if self.is_first and self.resolution == self.init_resolution: + x = paddle.to_tensor(self.const, dtype='float32') + x = x.unsqueeze(0).tile([ws.shape[0], 1, 1]) + else: + x = paddle.to_tensor(x, dtype='float32') + + #Main layers + if self.is_first: + x = x + self.pos_embedding + + if self.architecture == 'resnet': + y = self.skip(x.transpose([0,2,1]).reshape( + [ws.shape[0], self.h_dim, self.resolution, self.resolution])) + x = self.enc(x, next(w_iter)) + y = y.reshape([ws.shape[0], self.h_dim, self.seq_length]) + x = y.add_(x) + else: + x = paddle.to_tensor(self.enc(x, next(w_iter))) + + #ToRGB + if self.is_last: + if img is not None: + upsample2d = Upfirdn2dUpsample(self.resample_filter) + img = upsample2d(img) + + if self.architecture == 'skip': + y = self.torgb(x.transpose([0,2,1]).reshape( + [ws.shape[0], self.out_dim, self.resolution, self.resolution]), + next(w_iter), + fused_modconv=fused_modconv) + y = paddle.to_tensor(y, dtype='float32') + img = img.add_(y) if img is not None else y + + #upsample + if self.resolution!=self.img_resolution: + upsample2d = Upfirdn2dUpsample(self.resample_filter) + x = upsample2d(x.transpose([0,2,1]).reshape([ws.shape[0], + self.out_dim, self.resolution, self.resolution])) + x = x.reshape([ws.shape[0], self.out_dim, self.seq_length * self.scale_ratio **2]) + x = x.transpose([0,2,1]) + + return x, img + + +class SynthesisNetwork(nn.Layer): + """SynthesisNetwork + + Attributes: + w_dim: Intermediate latent (W) dimensionality. + img_resolution: int, size of image + img_channels: int, channel of input image + num_block: int, Number of layers + num_ws: Number of intermediate latents to output, None = do not broadcast. 
+ """ + def __init__(self, w_dim, img_resolution, img_channels, depth, num_layers, G_dict, + linformer, init_resolution, minimum_head=1, conv_clamp=256, num_fp16_res=0): + super().__init__() + self.w_dim = w_dim + self.img_resolution = img_resolution + self.img_resolution_log2 = int(np.log2(img_resolution)) + self.img_channels = img_channels + self.num_block = num_layers + self.linformer = linformer + if init_resolution==12: + self.block_resolutions = [3 * 2 ** i for i in range(2, self.img_resolution_log2)] + else: + self.block_resolutions = [2 ** i for i in range(3, self.img_resolution_log2 + 1)] + + channels_dict = dict(zip(*[self.block_resolutions, G_dict])) + fp16_resolution = max(2 ** (self.img_resolution_log2 + 1 - num_fp16_res), 8) + + self.num_ws = 0 + for i, res in enumerate(self.block_resolutions): + h_dim = channels_dict[res] + out_dim = None + if res!=self.img_resolution: + out_dim = channels_dict[res*2] + use_fp16 = (res >= fp16_resolution) + num_block_res = self.num_block[i] + for j in range(num_block_res): + is_first = (j == 0) + is_last = (j == num_block_res - 1) + block = EncoderBlock( + h_dim=h_dim, w_dim=w_dim, out_dim=out_dim, depth=depth, + minimum_head=minimum_head, img_resolution=img_resolution, + resolution=res, img_channels=img_channels, is_first=is_first, + is_last=is_last, use_fp16=use_fp16, conv_clamp=conv_clamp, + linformer=self.linformer, init_resolution=init_resolution) + self.num_ws += block.num_attention + if is_last: + self.num_ws += block.num_torgb + setattr(self, f'b{res}_{j}', block) + + def forward(self, ws=None): + block_ws = [] + ws = paddle.to_tensor(ws, dtype='float32') + w_idx = 0 + for i, res in enumerate(self.block_resolutions): + num_block_res = self.num_block[i] + res_ws = [] + for j in range(num_block_res): + block = getattr(self, f'b{res}_{j}') + res_ws.append(ws.slice(axes=[1], starts=[w_idx], + ends=[w_idx + block.num_attention + block.num_torgb])) + w_idx += block.num_attention + block_ws.append(res_ws) + + x = img = None + for i, (res, cur_ws) in enumerate(zip(self.block_resolutions, block_ws)): + num_block_res = self.num_block[i] + for j in range(num_block_res): + block = getattr(self, f'b{res}_{j}') + x, img = block(x, img, cur_ws[j]) + + return img + + +class Generator(nn.Layer): + """Generator class + + Attributes: + z_dim: Input latent (Z) dimensionality, 0 = no latent. + c_dim: Conditioning label (C) dimensionality, 0 = no label. + w_dim: Intermediate latent (W) dimensionality. + img_resolution: int, size of image + img_channels: int, channel of input image + num_ws: Number of intermediate latents to output, None = do not broadcast. 
+ """ + + def __init__(self, config): + super().__init__() + self.img_resolution = config.DATA.IMAGE_SIZE + self.img_channels = config.DATA.CHANNEL + self.z_dim = config.MODEL.GEN.Z_DIM + self.c_dim = config.MODEL.GEN.C_DIM + self.w_dim = config.MODEL.GEN.W_DIM + self.depth = config.MODEL.GEN.DEPTH + self.num_layers = config.MODEL.GEN.NUM_LAYERS + self.G_dict = config.MODEL.GEN.G_DICT + self.linformer = config.MODEL.GEN.LINFORMER + self.init_resolution = config.MODEL.GEN.RESOLUTION + self.synthesis = SynthesisNetwork(w_dim=self.w_dim, img_resolution=self.img_resolution, + depth=self.depth, num_layers=self.num_layers, G_dict=self.G_dict, + img_channels=self.img_channels, linformer=self.linformer, + init_resolution=self.init_resolution) + self.num_ws = self.synthesis.num_ws + self.mapping = MappingNetwork(z_dim=self.z_dim, c_dim=self.c_dim, + w_dim=self.w_dim, num_ws=self.num_ws) + + def forward(self, z, c, truncation_psi=1, truncation_cutoff=None): + ws = self.mapping(z, c, truncation_psi=truncation_psi, truncation_cutoff=truncation_cutoff) + output = self.synthesis(ws) + + return output diff --git a/gan/Styleformer/load_pytorch_weights.py b/gan/Styleformer/load_pytorch_weights.py new file mode 100644 index 00000000..917d0695 --- /dev/null +++ b/gan/Styleformer/load_pytorch_weights.py @@ -0,0 +1,212 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os +import numpy as np +import paddle +import torch +from training.networks_Generator import * +import legacy +import dnnlib +from generator import Generator +from config import * + + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/styleformer_cifar10.yaml') +parser.add_argument('-dataset', type=str, default="cifar10") +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-data_path', type=str, default='/dataset/cifar10/') +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + sum=0 + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + sum=sum+1 + print(sum) + print('----------------------------------') + + +def print_model_named_buffers(model): + sum=0 + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + sum=sum+1 + print(sum) + print('----------------------------------') + + +def torch_to_paddle_mapping(): + resolution = config.MODEL.GEN.RESOLUTION + prefix = f'synthesis.b{resolution}_0' + mapping = [ + (f'{prefix}.const', f'{prefix}.const'), + ] + num_layers = config.MODEL.GEN.NUM_LAYERS + # torch 'layers' to paddle 'stages' + num_stages = len(num_layers) + linformer = config.MODEL.GEN.LINFORMER + i = 0 + for i in range(num_stages): + stage_idx = 2**i * resolution + pp_s_prefix = f'synthesis.b{stage_idx}_' + th_s_prefix = f'synthesis.b{stage_idx}_' + mapping.extend([(f'{th_s_prefix}0.pos_embedding', f'{pp_s_prefix}0.pos_embedding')]) + + for block_idx in range(num_layers[i]): + th_b_prefix = f'{th_s_prefix}{block_idx}' + pp_b_prefix = f'{pp_s_prefix}{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.enc.q_weight', f'{pp_b_prefix}.enc.q_weight'), + (f'{th_b_prefix}.enc.k_weight', f'{pp_b_prefix}.enc.k_weight'), + (f'{th_b_prefix}.enc.v_weight', f'{pp_b_prefix}.enc.v_weight'), + (f'{th_b_prefix}.enc.w_weight', f'{pp_b_prefix}.enc.w_weight'), + (f'{th_b_prefix}.enc.u_weight', f'{pp_b_prefix}.enc.u_weight'), + (f'{th_b_prefix}.enc.bias', f'{pp_b_prefix}.enc.bias'), + (f'{th_b_prefix}.enc.affine1.weight', f'{pp_b_prefix}.enc.affine1.weight'), + (f'{th_b_prefix}.enc.affine1.bias', f'{pp_b_prefix}.enc.affine1.bias'), + (f'{th_b_prefix}.resample_filter', f'{pp_b_prefix}.resample_filter'), + (f'{th_b_prefix}.enc.noise_const', f'{pp_b_prefix}.enc.noise_const'), + (f'{th_b_prefix}.enc.noise_strength', f'{pp_b_prefix}.enc.noise_strength'), + ] + if stage_idx>=32 and linformer: + mapping.extend([(f'{th_s_prefix}0.proj_weight', f'{pp_s_prefix}0.proj_weight')]) + mapping.extend(layer_mapping) + + mapping.extend([ + (f'{th_b_prefix}.torgb.weight', f'{pp_b_prefix}.torgb.weight'), + (f'{th_b_prefix}.torgb.bias', f'{pp_b_prefix}.torgb.bias'), + (f'{th_b_prefix}.torgb.affine.weight', f'{pp_b_prefix}.torgb.affine.weight'), + (f'{th_b_prefix}.torgb.affine.bias', f'{pp_b_prefix}.torgb.affine.bias'), + ]) + i = i + 1 + mapping.extend([('mapping.fc0', 'mapping.fc0'), + ('mapping.fc1', 'mapping.fc1'), + ('mapping.w_avg', 'mapping.w_avg')]) + return mapping + + +def convert(torch_model, paddle_model): + + def _set_value(th_name, pd_name, 
no_transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + if value.shape == (): + value = value.reshape(1) + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, buff in paddle_model.named_buffers(): + pd_params[name] = buff + for name, buff in torch_model.named_buffers(): + th_params[name] = buff + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if th_name.endswith('relative_position_bias_table'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name, no_transpose=True) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = Generator(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + device = torch.device('cpu') + # load weights from local + torch_model = Generator_torch(z_dim=512,c_dim=0,w_dim=512,img_resolution=32,img_channels=3) + with dnnlib.util.open_url(r'./Pretrained_CIFAR10.pkl') as f: + torch_model = legacy.load_network_pkl(f)['G_ema'].to(device) # type: ignore + torch_model.eval() + + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(32, 512).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch, c=torch.zeros(1)) + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + out_paddle = paddle_model(x_paddle, c=paddle.zeros([1])) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:20]) + print(out_paddle[0, 0:20]) + assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./cifar10.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/gan/Styleformer/lsun_church_dataset.py b/gan/Styleformer/lsun_church_dataset.py new file mode 100644 index 00000000..e106b391 --- /dev/null +++ b/gan/Styleformer/lsun_church_dataset.py @@ -0,0 +1,124 @@ + # Copyright (c) 2021 PPViT Authors. All Rights Reserved. 
+ # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +""" +LSUN-church Dataset and related methods +""" +import os +import io +import numpy as np +import lmdb +from PIL import Image +from paddle.io import Dataset + + +class LSUNchurchDataset(Dataset): + """paddle dataset for loading LSUN-church binary data + This class will load the lmdb file from LSUN-church dataset, + extract and read images. Images are stored in list of numpy array + + Args: + file_folder: str, folder path of LSUN-church dataset lmdb + mode: str, dataset mode, choose from ['train', 'val'], default: 'train' + transform: paddle.vision.transforms, transforms which is applied on data, default: None + max_num_images: int, num of images used in the dataset, + if None, use all the images, default: None + """ + def __init__(self, file_folder, mode='train', transform=None, max_num_images=None): + super().__init__() + assert mode in ['train', 'val'] + self.transform = transform + self.file_folder = file_folder + with lmdb.open(file_folder, + map_size=1099511627776, + max_readers=32, + readonly=True, + readahead=False, + meminit=False, + lock=False).begin(write=False) as txn: + self.num_images = txn.stat()['entries'] + # efficient way of loading keys only + self.keys = list(txn.cursor().iternext(values=False)) + + self.txn = None + self.env = None + + if max_num_images is not None: + self.num_images = min(self.num_images, max_num_images) + + print(f'----- LSUN-church dataset {mode} len = {self.num_images}') + + def open_lmdb(self): + """ Open lmdb, this method is called in __getitem__ method + Note that lmdb is not opened in __init__ method, to support multi-process. 
+ Reference: https://github.com/pytorch/vision/issues/689 + """ + self.env = lmdb.open(self.file_folder, + max_readers=32, + readonly=True, + readahead=False, + meminit=False, + lock=False) + self.txn = self.env.begin(buffers=True) + + def __len__(self): + return self.num_images + + def __getitem__(self, index): + if not hasattr(self, 'txn'): + self.open_lmdb() + key = self.keys[index] + image_bytes = self.txn.get(key) + image = read_image(image_bytes) + if self.transform is not None: + image = self.transform(image) + label = 0 + return image, label + + +def read_image(image_bytes): + """read image from bytes loaded from lmdb file + Args: + image_bytes: bytes, image data in bytes + Returns: + image: np.array, stores the image with shape [h, w, c] + """ + image = Image.open(io.BytesIO(image_bytes)) + image = np.array(image) + return image + + +def save_image(image, name): + img = Image.fromarray(image) + img.save(f"{name}.png") + + +def save_images(images, labels, out_path): + for idx, image in enumerate(images): + out_path = os.path.join(out_path, str(labels[idx])) + os.makedirs(out_path, exist_ok=True) + save_image(image, os.path.join(out_path, str(idx))) + + +## NOTE: this is for test, can be removed later +#if __name__ == "__main__": +# dataset = LSUNchurchDataset(file_folder='./church_outdoor_train_lmdb') +# for idx, (data, label) in enumerate(dataset): +# print(idx) +# print(data.shape) +# # save images to file +# save_image(data, f'lsun_{idx}') +# print('-----') +# if idx == 10: +# break diff --git a/gan/Styleformer/main_multi_gpu.py b/gan/Styleformer/main_multi_gpu.py new file mode 100644 index 00000000..2ab2aa3b --- /dev/null +++ b/gan/Styleformer/main_multi_gpu.py @@ -0,0 +1,439 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
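+
+# NOTE: the launch commands below are an illustrative sketch, not part of the
+# original file; config names and paths are assumptions (see ./configs).
+# This script trains/evaluates Styleformer with one process per GPU via
+# paddle.distributed.spawn, e.g.:
+#
+#     # train on 4 GPUs
+#     python main_multi_gpu.py -cfg ./configs/styleformer_cifar10.yaml \
+#         -dataset cifar10 -data_path /dataset/cifar10/ -ngpus 4
+#
+#     # evaluate FID only, using converted weights (e.g. ./cifar10.pdparams)
+#     python main_multi_gpu.py -cfg ./configs/styleformer_cifar10.yaml \
+#         -dataset cifar10 -data_path /dataset/cifar10/ -eval -pretrained ./cifar10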
+ +"""Styleformer training/validation using single GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.distributed as dist +from datasets import get_dataloader +from datasets import get_dataset +from generator import Generator +from discriminator import StyleGANv2Discriminator +from utils.utils import AverageMeter +from utils.utils import WarmupCosineScheduler +from utils.utils import gradient_penalty +from utils.utils import all_gather +from config import get_config +from config import update_config +from metrics.fid import FID + +parser = argparse.ArgumentParser('Styleformer') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +parser_args = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, parser_args) + + +config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +#config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +file_handler = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +file_handler.setFormatter(logging.Formatter(log_format)) +logger.addHandler(file_handler) +logger.info(f'config= {config}') + +def train(dataloader, + gen, + dis, + z_dim, + gen_optimizer, + dis_optimizer, + epoch, + total_batch, + debug_steps=100): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + z_dim: int, input dimenstion of generator + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + gen.train() + train_loss_meter = AverageMeter() + time_st = time.time() + lambda_gp = 10 + # fid = FID() + for batch_id, data in enumerate(dataloader): + dis_optimizer.clear_grad() + real_img = data[0] + batch_size = real_img.shape[0] + + noise = paddle.randn([batch_size, z_dim]) + fake_img = gen(noise, c=paddle.zeros([0])) + fake_img = (fake_img * 127.5 + 128).clip(0, 255).astype('uint8') + fake_img = fake_img / 255.0 + fake_pred = dis(fake_img.detach()) + real_pred = dis(real_img) + + # fid.update(fake_img, real_img) + # fid_score = fid.accumulate() + # print(fake_pred[0],real_pred[0]) + gp = gradient_penalty(dis, real_img, fake_img.detach()) + d_loss = -(paddle.mean(real_pred) - paddle.mean(fake_pred)) + lambda_gp * gp + + d_loss.backward() + dis_optimizer.step() + + for _ in range(5): + 
gen_optimizer.clear_grad() + noise = paddle.randn([batch_size, z_dim]) + gen_img = gen(noise, c=paddle.zeros([0])) + gen_img = (gen_img * 127.5 + 128).clip(0, 255).astype('uint8') + gen_img = gen_img / 255.0 + #gen_imgs=paddle.multiply(gen_img,paddle.to_tensor(127.5)) + #gen_imgs=paddle.clip(paddle.add( + # gen_imgs,paddle.to_tensor(127.5)).transpose((0,2,3,1)), + # min=0.0,max=255.0).astype('uint8') + + fake_pred = dis(gen_img) + g_loss = -paddle.mean(fake_pred) + + g_loss.backward() + gen_optimizer.step() + + train_loss_meter.update(d_loss.numpy()[0] + g_loss.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"D Loss: {d_loss.item():.4f}, " + + f"G Loss: {g_loss.item():.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_time + + +def r1_penalty(real_pred, real_img): + """ + R1 regularization for discriminator. The core idea is to + penalize the gradient on real data alone: when the + generator distribution produces the true data distribution + and the discriminator is equal to 0 on the data manifold, the + gradient penalty ensures that the discriminator cannot create + a non-zero gradient orthogonal to the data manifold without + suffering a loss in the GAN game. + Ref: + Eq. 9 in Which training methods for GANs do actually converge. + """ + + grad_real = paddle.grad(outputs=real_pred.sum(), + inputs=real_img, + create_graph=True)[0] + grad_penalty = (grad_real * grad_real).reshape([grad_real.shape[0], + -1]).sum(1).mean() + return grad_penalty + + +def validate(dataloader, + model, + z_dim, + batch_size, + total_batch, + num_classes, + max_real_num=None, + max_gen_num=None, + debug_steps=32): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + z_dim: int, input dimenstion of generator + batch_size: int, batch size (used to init FID measturement) + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + fid_score: float, fid score + val_time: int, validation time in ms + """ + model.eval() + time_st = time.time() + fid = FID(batch_size) + fid_preds_all = [] + fid_gts_all = [] + # similar to metric type: fid50k_full, fid50k, etc. 
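+    # max_real_num / max_gen_num (if given) cap how many real and generated images
+    # this rank contributes to the FID statistics, mirroring the fid50k-style
+    # metrics; per-batch Inception features are merged across ranks via all_gather
+    # before fid.accumulate() computes the final score.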
+ if max_real_num is not None: + max_real_batch = max_real_num // batch_size + else: + max_real_batch = total_batch + if max_gen_num is not None: + max_gen_batch = max_gen_num // batch_size + else: + max_gen_batch = total_batch + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + if batch_id >= max_real_batch: + break + curr_batch_size = data[0].shape[0] + fid.batch_size = curr_batch_size + + real_image = data[0] + z = paddle.randn([curr_batch_size, z_dim]) + fake_image = model(z, c=paddle.randint(0, num_classes, [curr_batch_size])) + + fake_image = (fake_image * 127.5 + 128).clip(0, 255).astype('uint8') + fake_image = fake_image / 255.0 + + fid.update(fake_image, real_image) + + # if exceed max num of gen, skip gather + if batch_id < max_gen_batch: + # gather all fid related data from other gpus + fid_preds_list = all_gather(fid.preds) + fid_preds = sum(fid_preds_list, []) + fid_preds_all.extend(fid_preds) + + fid_gts_list = all_gather(fid.gts) + fid_gts = sum(fid_gts_list, []) + fid_gts_all.extend(fid_gts) + + fid.reset() + + if batch_id % debug_steps == 0: + if batch_id >= max_gen_batch: + logger.info(f"Val Step[{batch_id:04d}/{total_batch:04d}] done (no gen)") + else: + logger.info(f"Val Step[{batch_id:04d}/{total_batch:04d}] done") + + fid.preds = fid_preds_all + fid.gts = fid_gts_all + fid_score = fid.accumulate() + val_time = time.time() - time_st + return fid_score, val_time + + +def main_worker(*args): + # 0. Preparation + dist.init_parallel_env() + last_epoch = config.TRAIN.LAST_EPOCH + world_size = dist.get_world_size() + local_rank = dist.get_rank() + logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + seed = config.SEED + local_rank + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # 1. Create model + gen = Generator(config) + gen = paddle.DataParallel(gen) + # dis = Discriminator(c_dim=0,img_resolution=32,img_channels=3) + dis = StyleGANv2Discriminator(config) + dis = paddle.DataParallel(dis) + # 2. Create train and val dataloader + dataset_train, dataset_val = args[0], args[1] + dataloader_train = get_dataloader(config, dataset_train, 'train', True) + dataloader_val = get_dataloader(config, dataset_val, 'val', True) + total_batch_train = len(dataloader_train) + total_batch_val = len(dataloader_val) + logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + # 3. Define criterion + # validation criterion (FID) is defined in validate method + # 4. 
Define lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + # 5. Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + gen_optimizer = paddle.optimizer.Momentum( + parameters=gen.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + dis_optimizer = paddle.optimizer.Momentum( + parameters=dis.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "Adam": + gen_optimizer = paddle.optimizer.Adam( + parameters=gen.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + ) + dis_optimizer = paddle.optimizer.Adam( + parameters=dis.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + ) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # 6. 
Load pretrained model or load resume model and optimizer states + if config.MODEL.PRETRAINED: + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + gen.set_dict(model_state["gen_state_dict"]) + dis.set_dict(model_state["dis_state_dict"]) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + # load model weights + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') + gen.set_dict(model_state["gen_state_dict"]) + dis.set_dict(model_state["dis_state_dict"]) + # load optimizer + opt_state = paddle.load(config.MODEL.RESUME + '.pdopt') + gen_optimizer.set_state_dict(opt_state["gen_state_dict"]) + dis_optimizer.set_state_dict(opt_state["dis_state_dict"]) + logger.info(f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") + + # 7. Validation + if config.EVAL: + logger.info('----- Start Validating') + fid_score, val_time = validate( + dataloader=dataloader_val, + model=gen, + z_dim=config.MODEL.GEN.Z_DIM, + batch_size=config.DATA.BATCH_SIZE, + total_batch=total_batch_val, + num_classes=config.MODEL.NUM_CLASSES, + max_real_num=config.DATA.MAX_REAL_NUM // config.NGPUS if config.DATA.MAX_REAL_NUM else None, + max_gen_num=config.DATA.MAX_GEN_NUM // config.NGPUS if config.DATA.MAX_GEN_NUM else None, + debug_steps=config.REPORT_FREQ) + logger.info(f" ----- FID: {fid_score:.4f}, time: {val_time:.2f}") + return + + # 8. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. 
LR={gen_optimizer.get_lr():.6f}") + train_loss, train_time = train(dataloader=dataloader_train, + gen=gen, + dis=dis, + gen_optimizer=gen_optimizer, + dis_optimizer=dis_optimizer, + z_dim=config.MODEL.GEN.Z_DIM, + epoch=epoch, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ) + scheduler.step() + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + fid_score, val_time = validate( + dataloader=dataloader_val, + model=gen, + z_dim=config.MODEL.GEN.Z_DIM, + batch_size=config.DATA.BATCH_SIZE, + total_batch=total_batch_val, + num_classes=config.MODEL.NUM_CLASSES, + max_real_num=config.DATA.MAX_REAL_NUM // config.NGPUS if config.DATA.MAX_REAL_NUM else None, + max_gen_num=config.DATA.MAX_GEN_NUM // config.NGPUS if config.DATA.MAX_GEN_NUM else None, + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation FID: {fid_score:.4f}, " + + f"time: {val_time:.2f}") + # model save + if local_rank == 0: + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save({"gen_state_dict":gen.state_dict(), + "dis_state_dict":dis.state_dict()}, model_path + '.pdparams') + paddle.save({"gen_state_dict":gen_optimizer.state_dict(), + "dis_state_dict":dis_optimizer.state_dict()}, model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + +def main(): + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + + +if __name__ == "__main__": + main() diff --git a/gan/Styleformer/main_single_gpu.py b/gan/Styleformer/main_single_gpu.py new file mode 100644 index 00000000..78426f14 --- /dev/null +++ b/gan/Styleformer/main_single_gpu.py @@ -0,0 +1,409 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
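+
+# NOTE: the usage sketch below is illustrative and not part of the original file;
+# config names and paths are assumptions (see ./configs).
+#
+#     # train on a single GPU
+#     CUDA_VISIBLE_DEVICES=0 python main_single_gpu.py \
+#         -cfg ./configs/styleformer_cifar10.yaml -dataset cifar10 \
+#         -data_path /dataset/cifar10/
+#
+#     # evaluate FID only
+#     CUDA_VISIBLE_DEVICES=0 python main_single_gpu.py \
+#         -cfg ./configs/styleformer_cifar10.yaml -dataset cifar10 \
+#         -data_path /dataset/cifar10/ -eval -pretrained ./cifar10
+#
+# Training uses a WGAN-GP style objective (lambda_gp = 10) with five generator
+# updates per discriminator update; see train() below.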
+ +"""Styleformer training/validation using single GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +from datasets import get_dataloader +from datasets import get_dataset +from generator import Generator +from discriminator import StyleGANv2Discriminator +from utils.utils import AverageMeter +from utils.utils import WarmupCosineScheduler +from utils.utils import gradient_penalty +from config import get_config +from config import update_config +from metrics.fid import FID + +parser = argparse.ArgumentParser('Styleformer') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +args = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, args) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +file_handler = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +file_handler.setFormatter(logging.Formatter(log_format)) +logger.addHandler(file_handler) +logger.info(f'config= {config}') + +def train(dataloader, + gen, + dis, + gen_optimizer, + dis_optimizer, + epoch, + total_batch, + debug_steps=100): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + gen.train() + train_loss_meter = AverageMeter() + time_st = time.time() + lambda_gp = 10 + # fid = FID() + for batch_id, data in enumerate(dataloader): + dis_optimizer.clear_grad() + real_img = data[0] + batch_size = real_img.shape[0] + + noise = paddle.randn([batch_size, gen.z_dim]) + fake_img = gen(noise, c=paddle.zeros([0])) + fake_img = (fake_img * 127.5 + 128).clip(0, 255).astype('uint8') + fake_img = fake_img / 255.0 + fake_pred = dis(fake_img.detach()) + real_pred = dis(real_img) + + # fid.update(fake_img, real_img) + # fid_score = fid.accumulate() + # print(fake_pred[0],real_pred[0]) + gp = gradient_penalty(dis, real_img, fake_img.detach()) + d_loss = -(paddle.mean(real_pred) - paddle.mean(fake_pred)) + lambda_gp * gp + + d_loss.backward() + dis_optimizer.step() + + for _ in range(5): + gen_optimizer.clear_grad() + noise = paddle.randn([batch_size, gen.z_dim]) + gen_img = gen(noise, c=paddle.zeros([0])) + gen_img = (gen_img * 127.5 + 128).clip(0, 255).astype('uint8') + gen_img = gen_img / 255.0 + 
#gen_imgs=paddle.multiply(gen_img,paddle.to_tensor(127.5)) + #gen_imgs=paddle.clip(paddle.add( + # gen_imgs,paddle.to_tensor(127.5)).transpose((0,2,3,1)), + # min=0.0,max=255.0).astype('uint8') + + fake_pred = dis(gen_img) + g_loss = -paddle.mean(fake_pred) + + g_loss.backward() + gen_optimizer.step() + + train_loss_meter.update(d_loss.numpy()[0] + g_loss.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"D Loss: {d_loss.item():.4f}, " + + f"G Loss: {g_loss.item():.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_time + +def r1_penalty(real_pred, real_img): + """ + R1 regularization for discriminator. The core idea is to + penalize the gradient on real data alone: when the + generator distribution produces the true data distribution + and the discriminator is equal to 0 on the data manifold, the + gradient penalty ensures that the discriminator cannot create + a non-zero gradient orthogonal to the data manifold without + suffering a loss in the GAN game. + Ref: + Eq. 9 in Which training methods for GANs do actually converge. + """ + + grad_real = paddle.grad(outputs=real_pred.sum(), + inputs=real_img, + create_graph=True)[0] + grad_penalty = (grad_real * grad_real).reshape([grad_real.shape[0], + -1]).sum(1).mean() + return grad_penalty + + +def validate(dataloader, + model, + batch_size, + total_batch, + num_classes, + max_real_num=None, + max_gen_num=None, + debug_steps=32): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + batch_size: int, batch size (used to init FID measturement) + total_epoch: int, total num of epoch, for logging + max_real_num: int, max num of real images loaded from dataset + max_gen_num: int, max num of fake images genearted for validation + debug_steps: int, num of iters to log info + Returns: + fid_score: float, fid score + val_time: int, validation time in ms + """ + model.eval() + time_st = time.time() + fid = FID(batch_size) + fid_preds_all = [] + fid_gts_all = [] + # similar to metric type: fid50k_full, fid50k, etc. + if max_real_num is not None: + max_real_batch = max_real_num // batch_size + else: + max_real_batch = total_batch + if max_gen_num is not None: + max_gen_batch = max_gen_num // batch_size + else: + max_gen_batch = total_batch + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + if batch_id >= max_real_batch: + break + curr_batch_size = data[0].shape[0] + fid.batch_size = curr_batch_size + + real_image = data[0] + z = paddle.randn([curr_batch_size, model.z_dim]) + fake_image = model(z, c=paddle.randint(0, num_classes, [curr_batch_size])) + + fake_image = (fake_image * 127.5 + 128).clip(0, 255).astype('uint8') + fake_image = fake_image / 255.0 + + fid.update(fake_image, real_image) + + if batch_id < max_gen_batch: + fid_preds_all.extend(fid.preds) + fid_gts_all.extend(fid.gts) + fid.reset() + if batch_id % debug_steps == 0: + if batch_id >= max_gen_batch: + logger.info(f"Val Step[{batch_id:04d}/{total_batch:04d}] done (no gen)") + else: + logger.info(f"Val Step[{batch_id:04d}/{total_batch:04d}] done") + + fid.preds = fid_preds_all + fid.gts = fid_gts_all + fid_score = fid.accumulate() + val_time = time.time() - time_st + return fid_score, val_time + + +def main(): + """main function for training and validation""" + # 0. 
Preparation + last_epoch = config.TRAIN.LAST_EPOCH + seed = config.SEED + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + + # 1. Create model + gen = Generator(config) + dis = StyleGANv2Discriminator(config) + + # 2. Create train and val dataloader + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + dataloader_train = get_dataloader(config, dataset_train, 'train', False) + dataloader_val = get_dataloader(config, dataset_val, 'val', False) + + # 3. Define criterion + # validation criterion (FID) is defined in validate method + + # 4. Define lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + # 5. Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + gen_optimizer = paddle.optimizer.Momentum( + parameters=gen.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + dis_optimizer = paddle.optimizer.Momentum( + parameters=dis.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "Adam": + gen_optimizer = paddle.optimizer.Adam( + parameters=gen.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + ) + dis_optimizer = paddle.optimizer.Adam( + parameters=dis.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + ) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # 6. 
Load pretrained model or load resume model and optimizer states + if config.MODEL.PRETRAINED: + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + gen.set_dict(model_state["gen_state_dict"]) + dis.set_dict(model_state["dis_state_dict"]) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + # load model weights + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') + gen.set_dict(model_state["gen_state_dict"]) + dis.set_dict(model_state["dis_state_dict"]) + # load optimizer + opt_state = paddle.load(config.MODEL.RESUME + '.pdopt') + gen_optimizer.set_state_dict(opt_state["gen_state_dict"]) + dis_optimizer.set_state_dict(opt_state["dis_state_dict"]) + logger.info(f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") + + # 7. Validation + if config.EVAL: + logger.info('----- Start Validating') + fid_score, val_time = validate( + dataloader=dataloader_val, + model=gen, + batch_size=config.DATA.BATCH_SIZE, + total_batch=len(dataloader_val), + num_classes=config.MODEL.NUM_CLASSES, + max_real_num=config.DATA.MAX_REAL_NUM, + max_gen_num=config.DATA.MAX_GEN_NUM, + debug_steps=config.REPORT_FREQ) + logger.info(f" ----- FID: {fid_score:.4f}, time: {val_time:.2f}") + return + + # 8. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. LR={gen_optimizer.get_lr():.6f}") + train_loss, train_time = train(dataloader=dataloader_train, + gen=gen, + dis=dis, + gen_optimizer=gen_optimizer, + dis_optimizer=dis_optimizer, + epoch=epoch, + total_batch=len(dataloader_train), + debug_steps=config.REPORT_FREQ) + scheduler.step() + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + fid_score, val_time = validate( + dataloader=dataloader_val, + model=gen, + batch_size=config.DATA.BATCH_SIZE, + total_batch=len(dataloader_val), + num_classes=config.MODEL.NUM_CLASSES, + max_real_num=config.DATA.MAX_REAL_NUM, + max_gen_num=config.DATA.MAX_GEN_NUM, + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation FID: {fid_score:.4f}, " + + f"time: {val_time:.2f}") + # model save + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save({"gen_state_dict":gen.state_dict(), + "dis_state_dict":dis.state_dict()}, model_path + '.pdparams') + paddle.save({"gen_state_dict":gen_optimizer.state_dict(), + "dis_state_dict":dis_optimizer.state_dict()}, model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + +if __name__ == "__main__": + main() diff --git a/gan/Styleformer/metrics/Registry.py b/gan/Styleformer/metrics/Registry.py new file mode 100644 index 00000000..e1de1c66 --- /dev/null +++ b/gan/Styleformer/metrics/Registry.py @@ -0,0 +1,125 @@ +# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import traceback + + +class Registry(object): + """ + The registry that provides name -> object mapping, to support third-party users' custom modules. + To create a registry (inside ppgan): + .. code-block:: python + BACKBONE_REGISTRY = Registry('BACKBONE') + To register an object: + .. code-block:: python + @BACKBONE_REGISTRY.register() + class MyBackbone(): + ... + Or: + .. code-block:: python + BACKBONE_REGISTRY.register(MyBackbone) + """ + def __init__(self, name): + """ + Args: + name (str): the name of this registry + """ + self._name = name + + self._obj_map = {} + + def _do_register(self, name, obj): + assert ( + name not in self._obj_map + ), "An object named '{}' was already registered in '{}' registry!".format( + name, self._name) + self._obj_map[name] = obj + + def register(self, obj=None, name=None): + """ + Register the given object under the the name `obj.__name__`. + Can be used as either a decorator or not. See docstring of this class for usage. + """ + if obj is None: + # used as a decorator + def deco(func_or_class, name=name): + if name is None: + name = func_or_class.__name__ + self._do_register(name, func_or_class) + return func_or_class + + return deco + + # used as a function call + if name is None: + name = obj.__name__ + self._do_register(name, obj) + + def get(self, name): + ret = self._obj_map.get(name) + if ret is None: + raise KeyError( + "No object named '{}' found in '{}' registry!".format( + name, self._name)) + + return ret + + +def build_from_config(cfg, registry, default_args=None): + """Build a class from config dict. + Args: + cfg (dict): Config dict. It should at least contain the key "name". + registry (ppgan.utils.Registry): The registry to search the name from. + default_args (dict, optional): Default initialization arguments. + Returns: + class: The constructed class. 
+ """ + if not isinstance(cfg, dict): + raise TypeError(f'cfg must be a dict, but got {type(cfg)}') + if 'name' not in cfg: + if default_args is None or 'name' not in default_args: + raise KeyError( + '`cfg` or `default_args` must contain the key "name", ' + f'but got {cfg}\n{default_args}') + if not isinstance(registry, Registry): + raise TypeError('registry must be an ppgan.utils.Registry object, ' + f'but got {type(registry)}') + if not (isinstance(default_args, dict) or default_args is None): + raise TypeError('default_args must be a dict or None, ' + f'but got {type(default_args)}') + + args = cfg.copy() + + if default_args is not None: + for name, value in default_args.items(): + args.setdefault(name, value) + + cls_name = args.pop('name') + if isinstance(cls_name, str): + obj_cls = registry.get(cls_name) + elif inspect.isclass(cls_name): + obj_cls = obj_cls + else: + raise TypeError( + f'name must be a str or valid name, but got {type(cls_name)}') + + try: + instance = obj_cls(**args) + except Exception as e: + stack_info = traceback.format_exc() + print("Fail to initial class [{}] with error: " + "{} and stack:\n{}".format(cls_name, e, str(stack_info))) + raise e + return instance \ No newline at end of file diff --git a/gan/Styleformer/metrics/__init__.py b/gan/Styleformer/metrics/__init__.py new file mode 100644 index 00000000..08fd7121 --- /dev/null +++ b/gan/Styleformer/metrics/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .psnr_ssim import PSNR, SSIM +from .fid import FID +from .builder import build_metric +from .Registry import Registry diff --git a/gan/Styleformer/metrics/builder.py b/gan/Styleformer/metrics/builder.py new file mode 100644 index 00000000..440ec3b2 --- /dev/null +++ b/gan/Styleformer/metrics/builder.py @@ -0,0 +1,27 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import paddle + +from .Registry import * + +METRICS = Registry("METRIC") + + +def build_metric(cfg): + cfg_ = cfg.copy() + name = cfg_.pop('name', None) + metric = METRICS.get(name)(**cfg_) + return metric diff --git a/gan/Styleformer/metrics/fid.py b/gan/Styleformer/metrics/fid.py new file mode 100644 index 00000000..f6d83748 --- /dev/null +++ b/gan/Styleformer/metrics/fid.py @@ -0,0 +1,301 @@ +#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. 
+# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import os +import fnmatch +import numpy as np +import cv2 +import paddle +from PIL import Image +from cv2 import imread +from scipy import linalg +from .inception import InceptionV3 +from paddle.utils.download import get_weights_path_from_url +from .builder import METRICS + +try: + from tqdm import tqdm +except: + + def tqdm(x): + return x + + +""" based on https://github.com/mit-han-lab/gan-compression/blob/master/metric/fid_score.py +""" +""" +inceptionV3 pretrain model is convert from pytorch, pretrain_model url is https://paddle-gan-models.bj.bcebos.com/params_inceptionV3.tar.gz +""" +INCEPTIONV3_WEIGHT_URL = "https://paddlegan.bj.bcebos.com/InceptionV3.pdparams" + +@METRICS.register() +class FID(paddle.metric.Metric): + def __init__(self, batch_size=1, use_GPU=True, dims = 2048, premodel_path=None, model=None): + self.batch_size = batch_size + self.use_GPU = use_GPU + self.dims = dims + self.premodel_path = premodel_path + if model is None: + block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims] + model = InceptionV3([block_idx], normalize_input=False) + if premodel_path is None: + premodel_path = get_weights_path_from_url(INCEPTIONV3_WEIGHT_URL) + self.model = model + param_dict = paddle.load(premodel_path) + self.model.load_dict(param_dict) + self.model.eval() + self.reset() + + def reset(self): + self.preds = [] + self.gts = [] + self.results = [] + + def update(self, preds, gts): + preds_inception, gts_inception = calculate_inception_val( + preds, gts, self.batch_size, self.model, self.use_GPU, self.dims) + self.preds.append(preds_inception) + self.gts.append(gts_inception) + + def accumulate(self): + self.preds = np.concatenate(self.preds, axis=0) + self.gts = np.concatenate(self.gts, axis=0) + value = calculate_fid_given_img(self.preds, self.gts) + self.reset() + return value + + def name(self): + return 'FID' + + +def _calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): + m1 = np.atleast_1d(mu1) + m2 = np.atleast_1d(mu2) + + sigma1 = np.atleast_2d(sigma1) + sigma2 = np.atleast_2d(sigma2) + + assert mu1.shape == mu2.shape, 'Training and test mean vectors have different lengths' + assert sigma1.shape == sigma2.shape, 'Training and test covariances have different dimensions' + + diff = mu1 - mu2 + + t = sigma1.dot(sigma2) + covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) + if not np.isfinite(covmean).all(): + msg = ('fid calculation produces singular product; ' + 'adding %s to diagonal of cov estimates') % eps + print(msg) + offset = np.eye(sigma1.shape[0]) * eps + covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) + + # Numerical error might give slight imaginary component + if np.iscomplexobj(covmean): + if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3): + m = np.max(np.abs(covmean.imag)) + raise ValueError('Imaginary component {}'.format(m)) + covmean = covmean.real + tr_covmean = np.trace(covmean) + + return (diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - + 2 * tr_covmean) + + +def 
_get_activations_from_ims(img, model, batch_size, dims, use_gpu): + n_batches = (len(img) + batch_size - 1) // batch_size + n_used_img = len(img) + + pred_arr = np.empty((n_used_img, dims)) + + for i in tqdm(range(n_batches)): + start = i * batch_size + end = start + batch_size + if end > len(img): + end = len(img) + images = img[start:end] + # if images.shape[1] != 3: + # images = images.transpose((0, 3, 1, 2)) + # images = paddle.to_tensor(images) + pred = model(images)[0][0] + pred_arr[start:end] = pred.reshape([end - start, -1]).cpu().numpy() + return pred_arr + + +def _compute_statistic_of_img(act): + mu = np.mean(act, axis=0) + sigma = np.cov(act, rowvar=False) + return mu, sigma + +def calculate_inception_val(img_fake, + img_real, + batch_size, + model, + use_gpu = True, + dims = 2048): + act_fake = _get_activations_from_ims(img_fake, model, batch_size, dims, use_gpu) + act_real = _get_activations_from_ims(img_real, model, batch_size, dims, use_gpu) + return act_fake, act_real + +def calculate_fid_given_img(act_fake, act_real): + + m1, s1 = _compute_statistic_of_img(act_fake) + m2, s2 = _compute_statistic_of_img(act_real) + fid_value = _calculate_frechet_distance(m1, s1, m2, s2) + return fid_value + + +def _get_activations(files, + model, + batch_size, + dims, + use_gpu, + premodel_path, + style=None): + if len(files) % batch_size != 0: + print(('Warning: number of images is not a multiple of the ' + 'batch size. Some samples are going to be ignored.')) + if batch_size > len(files): + print(('Warning: batch size is bigger than the datasets size. ' + 'Setting batch size to datasets size')) + batch_size = len(files) + + n_batches = len(files) // batch_size + n_used_imgs = n_batches * batch_size + + pred_arr = np.empty((n_used_imgs, dims)) + for i in tqdm(range(n_batches)): + start = i * batch_size + end = start + batch_size + + # same as stargan-v2 official implementation: resize to 256 first, then resize to 299 + if style == 'stargan': + img_list = [] + for f in files[start:end]: + im = Image.open(str(f)).convert('RGB') + if im.size[0] != 299: + im = im.resize((256, 256), 2) + im = im.resize((299, 299), 2) + + img_list.append(np.array(im).astype('float32')) + + images = np.array(img_list) + else: + images = np.array( + [imread(str(f)).astype(np.float32) for f in files[start:end]]) + + if len(images.shape) != 4: + images = imread(str(files[start])) + images = cv2.cvtColor(images, cv2.COLOR_BGR2GRAY) + images = np.array([images.astype(np.float32)]) + + images = images.transpose((0, 3, 1, 2)) + images /= 255 + + # imagenet normalization + if style == 'stargan': + mean = np.array([0.485, 0.456, 0.406]).astype('float32') + std = np.array([0.229, 0.224, 0.225]).astype('float32') + images[:] = (images[:] - mean[:, None, None]) / std[:, None, None] + + if style == 'stargan': + pred_arr[start:end] = inception_infer(images, premodel_path) + else: + with paddle.guard(): + images = paddle.to_tensor(images) + param_dict, _ = paddle.load(premodel_path) + model.set_dict(param_dict) + model.eval() + + pred = model(images)[0][0].numpy() + + pred_arr[start:end] = pred.reshape(end - start, -1) + + return pred_arr + + +def inception_infer(x, model_path): + exe = paddle.static.Executor() + [inference_program, feed_target_names, + fetch_targets] = paddle.static.load_inference_model(model_path, exe) + results = exe.run(inference_program, + feed={feed_target_names[0]: x}, + fetch_list=fetch_targets) + return results[0] + + +def _calculate_activation_statistics(files, + model, + premodel_path, + 
batch_size=50, + dims=2048, + use_gpu=False, + style=None): + act = _get_activations(files, model, batch_size, dims, use_gpu, + premodel_path, style) + mu = np.mean(act, axis=0) + sigma = np.cov(act, rowvar=False) + return mu, sigma + + +def _compute_statistics_of_path(path, + model, + batch_size, + dims, + use_gpu, + premodel_path, + style=None): + if path.endswith('.npz'): + f = np.load(path) + m, s = f['mu'][:], f['sigma'][:] + f.close() + else: + files = [] + for root, dirnames, filenames in os.walk(path): + for filename in fnmatch.filter( + filenames, '*.jpg') or fnmatch.filter(filenames, '*.png'): + files.append(os.path.join(root, filename)) + m, s = _calculate_activation_statistics(files, model, premodel_path, + batch_size, dims, use_gpu, + style) + return m, s + + +def calculate_fid_given_paths(paths, + premodel_path, + batch_size, + use_gpu, + dims, + model=None, + style=None): + assert os.path.exists( + premodel_path + ), 'pretrain_model path {} is not exists! Please download it first'.format( + premodel_path) + for p in paths: + if not os.path.exists(p): + raise RuntimeError('Invalid path: %s' % p) + + if model is None and style != 'stargan': + with paddle.guard(): + block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims] + model = InceptionV3([block_idx], class_dim=1008) + + m1, s1 = _compute_statistics_of_path(paths[0], model, batch_size, dims, + use_gpu, premodel_path, style) + m2, s2 = _compute_statistics_of_path(paths[1], model, batch_size, dims, + use_gpu, premodel_path, style) + + fid_value = _calculate_frechet_distance(m1, s1, m2, s2) + return fid_value + diff --git a/gan/Styleformer/metrics/inception.py b/gan/Styleformer/metrics/inception.py new file mode 100644 index 00000000..b98f2fc6 --- /dev/null +++ b/gan/Styleformer/metrics/inception.py @@ -0,0 +1,747 @@ +#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
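
Note on the FID computation in `fid_score.py` above: `_calculate_frechet_distance` evaluates the Fréchet distance between two Gaussians fitted to InceptionV3 activations, FID = ||mu1 - mu2||^2 + Tr(sigma1 + sigma2 - 2 (sigma1 sigma2)^(1/2)). Below is a minimal NumPy/SciPy sketch of that same formula; the 64-dimensional toy activations are illustrative only, while the metric above uses 2048-dimensional Inception pool features.

```python
import numpy as np
from scipy import linalg

def frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
    # FID = ||mu1 - mu2||^2 + Tr(sigma1 + sigma2 - 2 * sqrtm(sigma1 @ sigma2))
    diff = mu1 - mu2
    covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
    if not np.isfinite(covmean).all():
        # regularize when the covariance product is (near) singular
        offset = np.eye(sigma1.shape[0]) * eps
        covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
    covmean = covmean.real  # drop tiny imaginary parts from numerical error
    return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * np.trace(covmean)

# Toy usage: fit Gaussians to two batches of (illustrative) 64-d activations.
act_fake = np.random.randn(256, 64)
act_real = np.random.randn(256, 64)
mu_f, sig_f = act_fake.mean(axis=0), np.cov(act_fake, rowvar=False)
mu_r, sig_r = act_real.mean(axis=0), np.cov(act_real, rowvar=False)
print(frechet_distance(mu_f, sig_f, mu_r, sig_r))
```
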
+ +import math +import paddle +import paddle.nn as nn +from paddle.nn import Conv2D, AvgPool2D, MaxPool2D, BatchNorm, Linear, AdaptiveAvgPool2D + +__all__ = ['InceptionV3'] + + +class InceptionV3(nn.Layer): + DEFAULT_BLOCK_INDEX = 3 + BLOCK_INDEX_BY_DIM = { + 64: 0, # First max pooling features + 192: 1, # Second max pooling featurs + 768: 2, # Pre-aux classifier features + 2048: 3 # Final average pooling features + } + + def __init__(self, + output_blocks=[DEFAULT_BLOCK_INDEX], + class_dim=1000, + aux_logits=False, + resize_input=True, + normalize_input=True): + super(InceptionV3, self).__init__() + self.resize_input = resize_input + self.normalize_input = normalize_input + self.output_blocks = sorted(output_blocks) + self.last_needed_block = max(output_blocks) + self.class_dim = class_dim + self.aux_logits = aux_logits + + assert self.last_needed_block <= 3, 'Last possible output block index is 3' + self.blocks = [] + + self.Conv2d_1a_3x3 = ConvBNLayer(3, + 32, + 3, + stride=2, + name='Conv2d_1a_3x3') + self.Conv2d_2a_3x3 = ConvBNLayer(32, 32, 3, name='Conv2d_2a_3x3') + self.Conv2d_2b_3x3 = ConvBNLayer(32, + 64, + 3, + padding=1, + name='Conv2d_2b_3x3') + self.maxpool1 = MaxPool2D(kernel_size=3, stride=2) + + block0 = [ + self.Conv2d_1a_3x3, self.Conv2d_2a_3x3, self.Conv2d_2b_3x3, + self.maxpool1 + ] + self.blocks.append(nn.Sequential(*block0)) + ### block1 + + if self.last_needed_block >= 1: + self.Conv2d_3b_1x1 = ConvBNLayer(64, 80, 1, name='Conv2d_3b_1x1') + self.Conv2d_4a_3x3 = ConvBNLayer(80, 192, 3, name='Conv2d_4a_3x3') + self.maxpool2 = MaxPool2D(kernel_size=3, stride=2) + block1 = [self.Conv2d_3b_1x1, self.Conv2d_4a_3x3, self.maxpool2] + self.blocks.append(nn.Sequential(*block1)) + + ### block2 + ### Mixed_5b 5c 5d + if self.last_needed_block >= 2: + self.Mixed_5b = Fid_inceptionA(192, + pool_features=32, + name='Mixed_5b') + self.Mixed_5c = Fid_inceptionA(256, + pool_features=64, + name='Mixed_5c') + self.Mixed_5d = Fid_inceptionA(288, + pool_features=64, + name='Mixed_5d') + + ### Mixed_6 + self.Mixed_6a = InceptionB(288, name='Mixed_6a') + self.Mixed_6b = Fid_inceptionC(768, c7=128, name='Mixed_6b') + self.Mixed_6c = Fid_inceptionC(768, c7=160, name='Mixed_6c') + self.Mixed_6d = Fid_inceptionC(768, c7=160, name='Mixed_6d') + self.Mixed_6e = Fid_inceptionC(768, c7=192, name='Mixed_6e') + + block2 = [ + self.Mixed_5b, self.Mixed_5c, self.Mixed_5d, self.Mixed_6a, + self.Mixed_6b, self.Mixed_6c, self.Mixed_6d, self.Mixed_6e + ] + self.blocks.append(nn.Sequential(*block2)) + + if self.aux_logits: + self.AuxLogits = InceptionAux(768, self.class_dim, name='AuxLogits') + ### block3 + ### Mixed_7 + if self.last_needed_block >= 3: + self.Mixed_7a = InceptionD(768, name='Mixed_7a') + self.Mixed_7b = Fid_inceptionE_1(1280, name='Mixed_7b') + self.Mixed_7c = Fid_inceptionE_2(2048, name='Mixed_7c') + self.avgpool = AdaptiveAvgPool2D(output_size=1) + + block3 = [self.Mixed_7a, self.Mixed_7b, self.Mixed_7c, self.avgpool] + self.blocks.append(nn.Sequential(*block3)) + + def forward(self, x): + out = [] + aux = None + if self.resize_input: + x = nn.functional.interpolate(x, + size=[299, 299], + mode='bilinear', + align_corners=False, + align_mode=0) + + if self.normalize_input: + x = x * 2 - 1 + + for idx, block in enumerate(self.blocks): + x = block(x) + if self.aux_logits and (idx == 2): + aux = self.AuxLogits(x) + if idx in self.output_blocks: + out.append(x) + if idx == self.last_needed_block: + break + + return out, aux + + +class InceptionA(nn.Layer): + def __init__(self, in_channels, 
pool_features, name=None): + super(InceptionA, self).__init__() + self.branch1x1 = ConvBNLayer(in_channels, + 64, + 1, + name=name + '.branch1x1') + + self.branch5x5_1 = ConvBNLayer(in_channels, + 48, + 1, + name=name + '.branch5x5_1') + self.branch5x5_2 = ConvBNLayer(48, + 64, + 5, + padding=2, + name=name + '.branch5x5_2') + + self.branch3x3dbl_1 = ConvBNLayer(in_channels, + 64, + 1, + name=name + '.branch3x3dbl_1') + self.branch3x3dbl_2 = ConvBNLayer(64, + 96, + 3, + padding=1, + name=name + '.branch3x3dbl_2') + self.branch3x3dbl_3 = ConvBNLayer(96, + 96, + 3, + padding=1, + name=name + '.branch3x3dbl_3') + + self.branch_pool0 = AvgPool2D(kernel_size=3, + stride=1, + padding=1, + exclusive=True) + self.branch_pool = ConvBNLayer(in_channels, + pool_features, + 1, + name=name + '.branch_pool') + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch5x5 = self.branch5x5_1(x) + branch5x5 = self.branch5x5_2(branch5x5) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = self.branch_pool0(x) + branch_pool = self.branch_pool(branch_pool) + return paddle.concat( + [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1) + + +class InceptionB(nn.Layer): + def __init__(self, in_channels, name=None): + super(InceptionB, self).__init__() + self.branch3x3 = ConvBNLayer(in_channels, + 384, + 3, + stride=2, + name=name + '.branch3x3') + + self.branch3x3dbl_1 = ConvBNLayer(in_channels, + 64, + 1, + name=name + '.branch3x3dbl_1') + self.branch3x3dbl_2 = ConvBNLayer(64, + 96, + 3, + padding=1, + name=name + '.branch3x3dbl_2') + self.branch3x3dbl_3 = ConvBNLayer(96, + 96, + 3, + stride=2, + name=name + '.branch3x3dbl_3') + + self.branch_pool = MaxPool2D(kernel_size=3, stride=2) + + def forward(self, x): + branch3x3 = self.branch3x3(x) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = self.branch_pool(x) + return paddle.concat([branch3x3, branch3x3dbl, branch_pool], + axis=1) + + +class InceptionC(nn.Layer): + def __init__(self, in_channels, c7, name=None): + super(InceptionC, self).__init__() + self.branch1x1 = ConvBNLayer(in_channels, + 192, + 1, + name=name + '.branch1x1') + + self.branch7x7_1 = ConvBNLayer(in_channels, + c7, + 1, + name=name + '.branch7x7_1') + self.branch7x7_2 = ConvBNLayer(c7, + c7, (1, 7), + padding=(0, 3), + name=name + '.branch7x7_2') + self.branch7x7_3 = ConvBNLayer(c7, + 192, (7, 1), + padding=(3, 0), + name=name + '.branch7x7_3') + + self.branch7x7dbl_1 = ConvBNLayer(in_channels, + c7, + 1, + name=name + '.branch7x7dbl_1') + self.branch7x7dbl_2 = ConvBNLayer(c7, + c7, (7, 1), + padding=(3, 0), + name=name + '.branch7x7dbl_2') + self.branch7x7dbl_3 = ConvBNLayer(c7, + c7, (1, 7), + padding=(0, 3), + name=name + '.branch7x7dbl_3') + self.branch7x7dbl_4 = ConvBNLayer(c7, + c7, (7, 1), + padding=(3, 0), + name=name + '.branch7x7dbl_4') + self.branch7x7dbl_5 = ConvBNLayer(c7, + 192, (1, 7), + padding=(0, 3), + name=name + '.branch7x7dbl_5') + + self.branch_pool0 = AvgPool2D(kernel_size=3, + stride=1, + padding=1, + exclusive=True) + self.branch_pool = ConvBNLayer(in_channels, + 192, + 1, + name=name + '.branch_pool') + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch7x7 = self.branch7x7_1(x) + branch7x7 = self.branch7x7_2(branch7x7) + branch7x7 = self.branch7x7_3(branch7x7) + + branch7x7dbl = self.branch7x7dbl_1(x) + branch7x7dbl = 
self.branch7x7dbl_2(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) + + branch_pool = self.branch_pool0(x) + branch_pool = self.branch_pool(branch_pool) + + return paddle.concat( + [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1) + + +class InceptionD(nn.Layer): + def __init__(self, in_channels, name=None): + super(InceptionD, self).__init__() + self.branch3x3_1 = ConvBNLayer(in_channels, + 192, + 1, + name=name + '.branch3x3_1') + self.branch3x3_2 = ConvBNLayer(192, + 320, + 3, + stride=2, + name=name + '.branch3x3_2') + + self.branch7x7x3_1 = ConvBNLayer(in_channels, + 192, + 1, + name=name + '.branch7x7x3_1') + self.branch7x7x3_2 = ConvBNLayer(192, + 192, (1, 7), + padding=(0, 3), + name=name + '.branch7x7x3_2') + self.branch7x7x3_3 = ConvBNLayer(192, + 192, (7, 1), + padding=(3, 0), + name=name + '.branch7x7x3_3') + self.branch7x7x3_4 = ConvBNLayer(192, + 192, + 3, + stride=2, + name=name + '.branch7x7x3_4') + + self.branch_pool = MaxPool2D(kernel_size=3, stride=2) + + def forward(self, x): + branch3x3 = self.branch3x3_1(x) + branch3x3 = self.branch3x3_2(branch3x3) + + branch7x7x3 = self.branch7x7x3_1(x) + branch7x7x3 = self.branch7x7x3_2(branch7x7x3) + branch7x7x3 = self.branch7x7x3_3(branch7x7x3) + branch7x7x3 = self.branch7x7x3_4(branch7x7x3) + + branch_pool = self.branch_pool(x) + + return paddle.concat([branch3x3, branch7x7x3, branch_pool], + axis=1) + + +class InceptionE(nn.Layer): + def __init__(self, in_channels, name=None): + super(InceptionE, self).__init__() + self.branch1x1 = ConvBNLayer(in_channels, + 320, + 1, + name=name + '.branch1x1') + + self.branch3x3_1 = ConvBNLayer(in_channels, + 384, + 1, + name=name + '.branch3x3_1') + self.branch3x3_2a = ConvBNLayer(384, + 384, (1, 3), + padding=(0, 1), + name=name + '.branch3x3_2a') + self.branch3x3_2b = ConvBNLayer(384, + 384, (3, 1), + padding=(1, 0), + name=name + '.branch3x3_2b') + + self.branch3x3dbl_1 = ConvBNLayer(in_channels, + 448, + 1, + name=name + '.branch3x3dbl_1') + self.branch3x3dbl_2 = ConvBNLayer(448, + 384, + 3, + padding=1, + name=name + '.branch3x3dbl_2') + self.branch3x3dbl_3a = ConvBNLayer(384, + 384, (1, 3), + padding=(0, 1), + name=name + '.branch3x3dbl_3a') + self.branch3x3dbl_3b = ConvBNLayer(384, + 384, (3, 1), + padding=(1, 0), + name=name + '.branch3x3dbl_3b') + + self.branch_pool0 = AvgPool2D(kernel_size=3, + stride=1, + padding=1, + exclusive=True) + self.branch_pool = ConvBNLayer(in_channels, + 192, + 1, + name=name + '.branch_pool') + + def forward(self, x): + branch1x1 = self.branch1x1(x) + branch3x3_1 = self.branch3x3_1(x) + branch3x3_2a = self.branch3x3_2a(branch3x3_1) + branch3x3_2b = self.branch3x3_2b(branch3x3_1) + branch3x3 = paddle.concat([branch3x3_2a, branch3x3_2b], axis=1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl_3a = self.branch3x3dbl_3a(branch3x3dbl) + branch3x3dbl_3b = self.branch3x3dbl_3b(branch3x3dbl) + branch3x3dbl = paddle.concat([branch3x3dbl_3a, branch3x3dbl_3b], + axis=1) + + branch_pool = self.branch_pool0(x) + branch_pool = self.branch_pool(branch_pool) + + return paddle.concat( + [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) + + +class InceptionAux(nn.Layer): + def __init__(self, in_channels, num_classes, name=None): + super(InceptionAux, self).__init__() + self.num_classes = num_classes + self.pool0 = AvgPool2D(kernel_size=5, stride=3) + self.conv0 = 
ConvBNLayer(in_channels, 128, 1, name=name + '.conv0') + self.conv1 = ConvBNLayer(128, 768, 5, name=name + '.conv1') + self.pool1 = AvgPool2D(global_pooling=True) + + def forward(self, x): + x = self.pool0(x) + x = self.conv0(x) + x = self.conv1(x) + x = self.pool1(x) + x = paddle.flatten(x, axis=1) + x = paddle.static.nn.fc(x, size=self.num_classes) + return x + + +class Fid_inceptionA(nn.Layer): + """ FID block in inception v3 + """ + def __init__(self, in_channels, pool_features, name=None): + super(Fid_inceptionA, self).__init__() + self.branch1x1 = ConvBNLayer(in_channels, + 64, + 1, + name=name + '.branch1x1') + + self.branch5x5_1 = ConvBNLayer(in_channels, + 48, + 1, + name=name + '.branch5x5_1') + self.branch5x5_2 = ConvBNLayer(48, + 64, + 5, + padding=2, + name=name + '.branch5x5_2') + + self.branch3x3dbl_1 = ConvBNLayer(in_channels, + 64, + 1, + name=name + '.branch3x3dbl_1') + self.branch3x3dbl_2 = ConvBNLayer(64, + 96, + 3, + padding=1, + name=name + '.branch3x3dbl_2') + self.branch3x3dbl_3 = ConvBNLayer(96, + 96, + 3, + padding=1, + name=name + '.branch3x3dbl_3') + + self.branch_pool0 = AvgPool2D(kernel_size=3, + stride=1, + padding=1, + exclusive=True) + self.branch_pool = ConvBNLayer(in_channels, + pool_features, + 1, + name=name + '.branch_pool') + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch5x5 = self.branch5x5_1(x) + branch5x5 = self.branch5x5_2(branch5x5) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = self.branch_pool0(x) + branch_pool = self.branch_pool(branch_pool) + return paddle.concat( + [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1) + + +class Fid_inceptionC(nn.Layer): + """ FID block in inception v3 + """ + def __init__(self, in_channels, c7, name=None): + super(Fid_inceptionC, self).__init__() + self.branch1x1 = ConvBNLayer(in_channels, + 192, + 1, + name=name + '.branch1x1') + + self.branch7x7_1 = ConvBNLayer(in_channels, + c7, + 1, + name=name + '.branch7x7_1') + self.branch7x7_2 = ConvBNLayer(c7, + c7, (1, 7), + padding=(0, 3), + name=name + '.branch7x7_2') + self.branch7x7_3 = ConvBNLayer(c7, + 192, (7, 1), + padding=(3, 0), + name=name + '.branch7x7_3') + + self.branch7x7dbl_1 = ConvBNLayer(in_channels, + c7, + 1, + name=name + '.branch7x7dbl_1') + self.branch7x7dbl_2 = ConvBNLayer(c7, + c7, (7, 1), + padding=(3, 0), + name=name + '.branch7x7dbl_2') + self.branch7x7dbl_3 = ConvBNLayer(c7, + c7, (1, 7), + padding=(0, 3), + name=name + '.branch7x7dbl_3') + self.branch7x7dbl_4 = ConvBNLayer(c7, + c7, (7, 1), + padding=(3, 0), + name=name + '.branch7x7dbl_4') + self.branch7x7dbl_5 = ConvBNLayer(c7, + 192, (1, 7), + padding=(0, 3), + name=name + '.branch7x7dbl_5') + + self.branch_pool0 = AvgPool2D(kernel_size=3, + stride=1, + padding=1, + exclusive=True) + self.branch_pool = ConvBNLayer(in_channels, + 192, + 1, + name=name + '.branch_pool') + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch7x7 = self.branch7x7_1(x) + branch7x7 = self.branch7x7_2(branch7x7) + branch7x7 = self.branch7x7_3(branch7x7) + + branch7x7dbl = self.branch7x7dbl_1(x) + branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) + + branch_pool = self.branch_pool0(x) + branch_pool = self.branch_pool(branch_pool) + + return paddle.concat( + [branch1x1, branch7x7, branch7x7dbl, branch_pool], 
axis=1) + + +class Fid_inceptionE_1(nn.Layer): + """ FID block in inception v3 + """ + def __init__(self, in_channels, name=None): + super(Fid_inceptionE_1, self).__init__() + self.branch1x1 = ConvBNLayer(in_channels, + 320, + 1, + name=name + '.branch1x1') + + self.branch3x3_1 = ConvBNLayer(in_channels, + 384, + 1, + name=name + '.branch3x3_1') + self.branch3x3_2a = ConvBNLayer(384, + 384, (1, 3), + padding=(0, 1), + name=name + '.branch3x3_2a') + self.branch3x3_2b = ConvBNLayer(384, + 384, (3, 1), + padding=(1, 0), + name=name + '.branch3x3_2b') + + self.branch3x3dbl_1 = ConvBNLayer(in_channels, + 448, + 1, + name=name + '.branch3x3dbl_1') + self.branch3x3dbl_2 = ConvBNLayer(448, + 384, + 3, + padding=1, + name=name + '.branch3x3dbl_2') + self.branch3x3dbl_3a = ConvBNLayer(384, + 384, (1, 3), + padding=(0, 1), + name=name + '.branch3x3dbl_3a') + self.branch3x3dbl_3b = ConvBNLayer(384, + 384, (3, 1), + padding=(1, 0), + name=name + '.branch3x3dbl_3b') + + self.branch_pool0 = AvgPool2D(kernel_size=3, + stride=1, + padding=1, + exclusive=True) + self.branch_pool = ConvBNLayer(in_channels, + 192, + 1, + name=name + '.branch_pool') + + def forward(self, x): + branch1x1 = self.branch1x1(x) + branch3x3_1 = self.branch3x3_1(x) + branch3x3_2a = self.branch3x3_2a(branch3x3_1) + branch3x3_2b = self.branch3x3_2b(branch3x3_1) + branch3x3 = paddle.concat([branch3x3_2a, branch3x3_2b], axis=1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl_3a = self.branch3x3dbl_3a(branch3x3dbl) + branch3x3dbl_3b = self.branch3x3dbl_3b(branch3x3dbl) + branch3x3dbl = paddle.concat([branch3x3dbl_3a, branch3x3dbl_3b], + axis=1) + + branch_pool = self.branch_pool0(x) + branch_pool = self.branch_pool(branch_pool) + + return paddle.concat( + [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) + + +class Fid_inceptionE_2(nn.Layer): + """ FID block in inception v3 + """ + def __init__(self, in_channels, name=None): + super(Fid_inceptionE_2, self).__init__() + self.branch1x1 = ConvBNLayer(in_channels, + 320, + 1, + name=name + '.branch1x1') + + self.branch3x3_1 = ConvBNLayer(in_channels, + 384, + 1, + name=name + '.branch3x3_1') + self.branch3x3_2a = ConvBNLayer(384, + 384, (1, 3), + padding=(0, 1), + name=name + '.branch3x3_2a') + self.branch3x3_2b = ConvBNLayer(384, + 384, (3, 1), + padding=(1, 0), + name=name + '.branch3x3_2b') + + self.branch3x3dbl_1 = ConvBNLayer(in_channels, + 448, + 1, + name=name + '.branch3x3dbl_1') + self.branch3x3dbl_2 = ConvBNLayer(448, + 384, + 3, + padding=1, + name=name + '.branch3x3dbl_2') + self.branch3x3dbl_3a = ConvBNLayer(384, + 384, (1, 3), + padding=(0, 1), + name=name + '.branch3x3dbl_3a') + self.branch3x3dbl_3b = ConvBNLayer(384, + 384, (3, 1), + padding=(1, 0), + name=name + '.branch3x3dbl_3b') + ### same with paper + self.branch_pool0 = MaxPool2D(kernel_size=3, + stride=1, + padding=1) + self.branch_pool = ConvBNLayer(in_channels, + 192, + 1, + name=name + '.branch_pool') + + def forward(self, x): + branch1x1 = self.branch1x1(x) + branch3x3_1 = self.branch3x3_1(x) + branch3x3_2a = self.branch3x3_2a(branch3x3_1) + branch3x3_2b = self.branch3x3_2b(branch3x3_1) + branch3x3 = paddle.concat([branch3x3_2a, branch3x3_2b], axis=1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl_3a = self.branch3x3dbl_3a(branch3x3dbl) + branch3x3dbl_3b = self.branch3x3dbl_3b(branch3x3dbl) + branch3x3dbl = paddle.concat([branch3x3dbl_3a, branch3x3dbl_3b], + axis=1) + + branch_pool = 
self.branch_pool0(x) + branch_pool = self.branch_pool(branch_pool) + + return paddle.concat( + [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + num_filters, + filter_size, + stride=1, + padding=0, + groups=1, + act='relu', + name=None): + super(ConvBNLayer, self).__init__() + self.conv = Conv2D(in_channels=in_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=paddle.ParamAttr(name=name + ".conv.weight"), + bias_attr=False) + self.bn = BatchNorm(num_filters, + act=act, + epsilon=0.001, + param_attr=paddle.ParamAttr(name=name + ".bn.weight"), + bias_attr=paddle.ParamAttr(name=name + ".bn.bias"), + moving_mean_name=name + '.bn.running_mean', + moving_variance_name=name + '.bn.running_var') + + def forward(self, inputs): + y = self.conv(inputs) + y = self.bn(y) + return y diff --git a/gan/Styleformer/metrics/psnr_ssim.py b/gan/Styleformer/metrics/psnr_ssim.py new file mode 100644 index 00000000..72702de0 --- /dev/null +++ b/gan/Styleformer/metrics/psnr_ssim.py @@ -0,0 +1,334 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cv2 +import numpy as np +import paddle + +from .builder import METRICS + + +@METRICS.register() +class PSNR(paddle.metric.Metric): + def __init__(self, crop_border, input_order='HWC', test_y_channel=False): + self.crop_border = crop_border + self.input_order = input_order + self.test_y_channel = test_y_channel + self.reset() + + def reset(self): + self.results = [] + + def update(self, preds, gts): + if not isinstance(preds, (list, tuple)): + preds = [preds] + + if not isinstance(gts, (list, tuple)): + gts = [gts] + + for pred, gt in zip(preds, gts): + value = calculate_psnr(pred, gt, self.crop_border, self.input_order, + self.test_y_channel) + self.results.append(value) + + def accumulate(self): + if paddle.distributed.get_world_size() > 1: + results = paddle.to_tensor(self.results) + results_list = [] + paddle.distributed.all_gather(results_list, results) + self.results = paddle.concat(results_list).numpy() + + if len(self.results) <= 0: + return 0. + return np.mean(self.results) + + def name(self): + return 'PSNR' + + +@METRICS.register() +class SSIM(PSNR): + def update(self, preds, gts): + if not isinstance(preds, (list, tuple)): + preds = [preds] + + if not isinstance(gts, (list, tuple)): + gts = [gts] + + for pred, gt in zip(preds, gts): + value = calculate_ssim(pred, gt, self.crop_border, self.input_order, + self.test_y_channel) + self.results.append(value) + + def name(self): + return 'SSIM' + + +def calculate_psnr(img1, + img2, + crop_border, + input_order='HWC', + test_y_channel=False): + """Calculate PSNR (Peak Signal-to-Noise Ratio). + + Ref: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio + + Args: + img1 (ndarray): Images with range [0, 255]. + img2 (ndarray): Images with range [0, 255]. 
+ crop_border (int): Cropped pixels in each edge of an image. These + pixels are not involved in the PSNR calculation. + input_order (str): Whether the input order is 'HWC' or 'CHW'. + Default: 'HWC'. + test_y_channel (bool): Test on Y channel of YCbCr. Default: False. + + Returns: + float: psnr result. + """ + + assert img1.shape == img2.shape, ( + f'Image shapes are differnet: {img1.shape}, {img2.shape}.') + if input_order not in ['HWC', 'CHW']: + raise ValueError( + f'Wrong input_order {input_order}. Supported input_orders are ' + '"HWC" and "CHW"') + img1 = img1.copy().astype('float32') + img2 = img2.copy().astype('float32') + img1 = reorder_image(img1, input_order=input_order) + img2 = reorder_image(img2, input_order=input_order) + + if crop_border != 0: + img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...] + img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] + + if test_y_channel: + img1 = to_y_channel(img1) + img2 = to_y_channel(img2) + + mse = np.mean((img1 - img2)**2) + if mse == 0: + return float('inf') + return 20. * np.log10(255. / np.sqrt(mse)) + + +def _ssim(img1, img2): + """Calculate SSIM (structural similarity) for one channel images. + + It is called by func:`calculate_ssim`. + + Args: + img1 (ndarray): Images with range [0, 255] with order 'HWC'. + img2 (ndarray): Images with range [0, 255] with order 'HWC'. + + Returns: + float: ssim result. + """ + + C1 = (0.01 * 255)**2 + C2 = (0.03 * 255)**2 + + img1 = img1.astype(np.float64) + img2 = img2.astype(np.float64) + kernel = cv2.getGaussianKernel(11, 1.5) + window = np.outer(kernel, kernel.transpose()) + + mu1 = cv2.filter2D(img1, -1, window)[5:-5, 5:-5] + mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5] + mu1_sq = mu1**2 + mu2_sq = mu2**2 + mu1_mu2 = mu1 * mu2 + sigma1_sq = cv2.filter2D(img1**2, -1, window)[5:-5, 5:-5] - mu1_sq + sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq + sigma12 = cv2.filter2D(img1 * img2, -1, window)[5:-5, 5:-5] - mu1_mu2 + + ssim_map = ((2 * mu1_mu2 + C1) * + (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * + (sigma1_sq + sigma2_sq + C2)) + return ssim_map.mean() + + +def calculate_ssim(img1, + img2, + crop_border, + input_order='HWC', + test_y_channel=False): + """Calculate SSIM (structural similarity). + + Ref: + Image quality assessment: From error visibility to structural similarity + + The results are the same as that of the official released MATLAB code in + https://ece.uwaterloo.ca/~z70wang/research/ssim/. + + For three-channel images, SSIM is calculated for each channel and then + averaged. + + Args: + img1 (ndarray): Images with range [0, 255]. + img2 (ndarray): Images with range [0, 255]. + crop_border (int): Cropped pixels in each edge of an image. These + pixels are not involved in the SSIM calculation. + input_order (str): Whether the input order is 'HWC' or 'CHW'. + Default: 'HWC'. + test_y_channel (bool): Test on Y channel of YCbCr. Default: False. + + Returns: + float: ssim result. + """ + + assert img1.shape == img2.shape, ( + f'Image shapes are differnet: {img1.shape}, {img2.shape}.') + if input_order not in ['HWC', 'CHW']: + raise ValueError( + f'Wrong input_order {input_order}. 
Supported input_orders are ' + '"HWC" and "CHW"') + + img1 = img1.copy().astype('float32')[..., ::-1] + img2 = img2.copy().astype('float32')[..., ::-1] + + img1 = reorder_image(img1, input_order=input_order) + img2 = reorder_image(img2, input_order=input_order) + + if crop_border != 0: + img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...] + img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] + + if test_y_channel: + img1 = to_y_channel(img1) + img2 = to_y_channel(img2) + + ssims = [] + for i in range(img1.shape[2]): + ssims.append(_ssim(img1[..., i], img2[..., i])) + return np.array(ssims).mean() + + +def reorder_image(img, input_order='HWC'): + """Reorder images to 'HWC' order. + + If the input_order is (h, w), return (h, w, 1); + If the input_order is (c, h, w), return (h, w, c); + If the input_order is (h, w, c), return as it is. + + Args: + img (ndarray): Input image. + input_order (str): Whether the input order is 'HWC' or 'CHW'. + If the input image shape is (h, w), input_order will not have + effects. Default: 'HWC'. + + Returns: + ndarray: reordered image. + """ + + if input_order not in ['HWC', 'CHW']: + raise ValueError( + f'Wrong input_order {input_order}. Supported input_orders are ' + "'HWC' and 'CHW'") + if len(img.shape) == 2: + img = img[..., None] + return img + if input_order == 'CHW': + img = img.transpose(1, 2, 0) + return img + + +def bgr2ycbcr(img, y_only=False): + """Convert a BGR image to YCbCr image. + + The bgr version of rgb2ycbcr. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `BGR <-> YCrCb`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + y_only (bool): Whether to only return Y channel. Default: False. + + Returns: + ndarray: The converted YCbCr image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + + if y_only: + out_img = np.dot(img, [24.966, 128.553, 65.481]) + 16.0 + else: + out_img = np.matmul( + img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786], + [65.481, -37.797, 112.0]]) + [16, 128, 128] + return out_img + + +def rgb2ycbcr(img, y_only=False): + """Convert a RGB image to YCbCr image. + + The RGB version of rgb2ycbcr. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `RGB <-> YCrCb`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + y_only (bool): Whether to only return Y channel. Default: False. + + Returns: + ndarray: The converted YCbCr image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + + if img_type != np.uint8: + img *= 255. + + if y_only: + out_img = np.dot(img, [65.481, 128.553, 24.966]) / 255. 
+ 16.0 + else: + out_img = np.matmul( + img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786], + [65.481, -37.797, 112.0]]) + [16, 128, 128] + + if img_type != np.uint8: + out_img /= 255. + else: + out_img = out_img.round() + + return out_img + + +def to_y_channel(img): + """Change to Y channel of YCbCr. + + Args: + img (ndarray): Images with range [0, 255]. + + Returns: + (ndarray): Images with range [0, 255] (float type) without round. + """ + img = img.astype(np.float32) / 255. + if img.ndim == 3 and img.shape[2] == 3: + img = rgb2ycbcr(img, y_only=True) + img = img[..., None] + return img * 255. diff --git a/gan/Styleformer/port_weights/load_pytorch_weights_celeba.py b/gan/Styleformer/port_weights/load_pytorch_weights_celeba.py new file mode 100644 index 00000000..9b36cf9b --- /dev/null +++ b/gan/Styleformer/port_weights/load_pytorch_weights_celeba.py @@ -0,0 +1,215 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +sys.path.append('./styleformer_pth') + +import argparse +import os +import numpy as np +import paddle +import torch +import legacy +import dnnlib +from training.networks_Generator import * +from generator import Generator as Generator_paddle +from config import * + + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/styleformer_celeba.yaml') +parser.add_argument('-dataset', type=str, default="cifar10") +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-data_path', type=str, default='/dataset/celeba/') +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + sum=0 + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + sum=sum+1 + print(sum) + print('----------------------------------') + + +def print_model_named_buffers(model): + sum=0 + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + sum=sum+1 + print(sum) + print('----------------------------------') + + +def torch_to_paddle_mapping(): + resolution = config.MODEL.GEN.RESOLUTION + prefix = f'synthesis.b{resolution}_0' + mapping = [ + (f'{prefix}.const', f'{prefix}.const'), + ] + num_layers = config.MODEL.GEN.NUM_LAYERS + # torch 'layers' to paddle 'stages' + num_stages = len(num_layers) + linformer = config.MODEL.GEN.LINFORMER + i = 0 + for i in range(num_stages): + stage_idx = 2**i * resolution + pp_s_prefix = f'synthesis.b{stage_idx}_' + th_s_prefix = f'synthesis.b{stage_idx}_' + mapping.extend([(f'{th_s_prefix}0.pos_embedding', f'{pp_s_prefix}0.pos_embedding')]) + + for block_idx in 
range(num_layers[i]): + th_b_prefix = f'{th_s_prefix}{block_idx}' + pp_b_prefix = f'{pp_s_prefix}{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.enc.q_weight', f'{pp_b_prefix}.enc.q_weight'), + (f'{th_b_prefix}.enc.k_weight', f'{pp_b_prefix}.enc.k_weight'), + (f'{th_b_prefix}.enc.v_weight', f'{pp_b_prefix}.enc.v_weight'), + (f'{th_b_prefix}.enc.w_weight', f'{pp_b_prefix}.enc.w_weight'), + (f'{th_b_prefix}.enc.u_weight', f'{pp_b_prefix}.enc.u_weight'), + (f'{th_b_prefix}.enc.bias', f'{pp_b_prefix}.enc.bias'), + (f'{th_b_prefix}.enc.affine1.weight', f'{pp_b_prefix}.enc.affine1.weight'), + (f'{th_b_prefix}.enc.affine1.bias', f'{pp_b_prefix}.enc.affine1.bias'), + (f'{th_b_prefix}.resample_filter', f'{pp_b_prefix}.resample_filter'), + (f'{th_b_prefix}.enc.noise_const', f'{pp_b_prefix}.enc.noise_const'), + (f'{th_b_prefix}.enc.noise_strength', f'{pp_b_prefix}.enc.noise_strength'), + ] + if stage_idx>=32 and linformer: + mapping.extend([(f'{th_s_prefix}0.proj_weight', f'{pp_s_prefix}0.proj_weight')]) + mapping.extend(layer_mapping) + + mapping.extend([ + (f'{th_b_prefix}.torgb.weight', f'{pp_b_prefix}.torgb.weight'), + (f'{th_b_prefix}.torgb.bias', f'{pp_b_prefix}.torgb.bias'), + (f'{th_b_prefix}.torgb.affine.weight', f'{pp_b_prefix}.torgb.affine.weight'), + (f'{th_b_prefix}.torgb.affine.bias', f'{pp_b_prefix}.torgb.affine.bias'), + ]) + i = i + 1 + mapping.extend([('mapping.fc0', 'mapping.fc0'), + ('mapping.fc1', 'mapping.fc1'), + ('mapping.w_avg', 'mapping.w_avg')]) + return mapping + + +def convert(torch_model, paddle_model): + + def _set_value(th_name, pd_name, no_transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + if value.shape == (): + value = value.reshape(1) + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, buff in paddle_model.named_buffers(): + pd_params[name] = buff + for name, buff in torch_model.named_buffers(): + th_params[name] = buff + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if th_name.endswith('relative_position_bias_table'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name, no_transpose=True) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = Generator_paddle(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + device = torch.device('cpu') + # load weights from local + with dnnlib.util.open_url('./styleformer_pth_models/Pretrained_CelebA.pkl') as f: + torch_model = legacy.load_network_pkl(f)['G_ema'].to(device) # type: ignore + torch_model.eval() + + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(32, 512).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch, c=torch.zeros(1)) + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + out_paddle = paddle_model(x_paddle, c=paddle.zeros([1])) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:20]) + print('=======') + print(out_paddle[0, 0:20]) + #assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./celeba.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/gan/Styleformer/port_weights/load_pytorch_weights_cifar10.py b/gan/Styleformer/port_weights/load_pytorch_weights_cifar10.py new file mode 100644 index 00000000..7bae33e2 --- /dev/null +++ b/gan/Styleformer/port_weights/load_pytorch_weights_cifar10.py @@ -0,0 +1,227 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
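
All of the `port_weights/load_pytorch_weights_*.py` scripts above follow the same pattern: build `(torch_name, paddle_name)` pairs in `torch_to_paddle_mapping()`, then copy each tensor in `convert()` via `_set_value`, which would transpose 2-D weights only when `no_transpose` is disabled (these scripts always leave `no_transpose=True`). A minimal sketch of that copy step follows; `copy_param` and the toy linear layers are illustrative, not part of the ported code.

```python
import numpy as np
import paddle
import torch

def copy_param(th_tensor, pd_param, transpose_2d=False):
    """Copy one torch tensor into one paddle parameter or buffer."""
    value = th_tensor.detach().cpu().numpy()
    if value.ndim == 2 and transpose_2d:
        # torch nn.Linear stores [out, in]; paddle nn.Linear stores [in, out]
        value = value.transpose((1, 0))
    assert tuple(pd_param.shape) == value.shape, (pd_param.shape, value.shape)
    pd_param.set_value(value)

# Toy usage on a single linear layer; the Styleformer scripts above copy the
# raw q/k/v/w/u weights without any transpose.
th_fc = torch.nn.Linear(4, 8)
pd_fc = paddle.nn.Linear(4, 8)
copy_param(th_fc.weight, pd_fc.weight, transpose_2d=True)
copy_param(th_fc.bias, pd_fc.bias)
```
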
+ +import sys +sys.path.append('./styleformer_pth') + +import argparse +import os +import numpy as np +import paddle +import torch +import legacy +import dnnlib +#from training.networks_Generator import * +from generator import Generator as Generator_paddle +from config import * + + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/styleformer_cifar10.yaml') +parser.add_argument('-dataset', type=str, default="cifar10") +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-data_path', type=str, default='/dataset/cifar10/') +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + sum=0 + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + sum=sum+1 + print(sum) + print('----------------------------------') + + +def print_model_named_buffers(model): + sum=0 + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + sum=sum+1 + print(sum) + print('----------------------------------') + + +def torch_to_paddle_mapping(): + resolution = config.MODEL.GEN.RESOLUTION + prefix = f'synthesis.b{resolution}_0' + mapping = [ + (f'{prefix}.const', f'{prefix}.const'), + ] + num_layers = config.MODEL.GEN.NUM_LAYERS + # torch 'layers' to paddle 'stages' + num_stages = len(num_layers) + linformer = config.MODEL.GEN.LINFORMER + i = 0 + for i in range(num_stages): + stage_idx = 2**i * resolution + pp_s_prefix = f'synthesis.b{stage_idx}_' + th_s_prefix = f'synthesis.b{stage_idx}_' + mapping.extend([(f'{th_s_prefix}0.pos_embedding', f'{pp_s_prefix}0.pos_embedding')]) + + for block_idx in range(num_layers[i]): + th_b_prefix = f'{th_s_prefix}{block_idx}' + pp_b_prefix = f'{pp_s_prefix}{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.enc.q_weight', f'{pp_b_prefix}.enc.q_weight'), + (f'{th_b_prefix}.enc.k_weight', f'{pp_b_prefix}.enc.k_weight'), + (f'{th_b_prefix}.enc.v_weight', f'{pp_b_prefix}.enc.v_weight'), + (f'{th_b_prefix}.enc.w_weight', f'{pp_b_prefix}.enc.w_weight'), + (f'{th_b_prefix}.enc.u_weight', f'{pp_b_prefix}.enc.u_weight'), + (f'{th_b_prefix}.enc.bias', f'{pp_b_prefix}.enc.bias'), + (f'{th_b_prefix}.enc.affine1.weight', f'{pp_b_prefix}.enc.affine1.weight'), + (f'{th_b_prefix}.enc.affine1.bias', f'{pp_b_prefix}.enc.affine1.bias'), + (f'{th_b_prefix}.resample_filter', f'{pp_b_prefix}.resample_filter'), + (f'{th_b_prefix}.enc.noise_const', f'{pp_b_prefix}.enc.noise_const'), + (f'{th_b_prefix}.enc.noise_strength', f'{pp_b_prefix}.enc.noise_strength'), + ] + if stage_idx>=32 and linformer: + mapping.extend([(f'{th_s_prefix}0.proj_weight', f'{pp_s_prefix}0.proj_weight')]) + mapping.extend(layer_mapping) + + mapping.extend([ + (f'{th_b_prefix}.torgb.weight', f'{pp_b_prefix}.torgb.weight'), + (f'{th_b_prefix}.torgb.bias', f'{pp_b_prefix}.torgb.bias'), + (f'{th_b_prefix}.torgb.affine.weight', f'{pp_b_prefix}.torgb.affine.weight'), + (f'{th_b_prefix}.torgb.affine.bias', f'{pp_b_prefix}.torgb.affine.bias'), + ]) + i = i + 1 + mapping.extend([('mapping.fc0', 'mapping.fc0'), + ('mapping.fc1', 'mapping.fc1'), + ('mapping.w_avg', 'mapping.w_avg')]) + return mapping + + +def 
convert(torch_model, paddle_model): + + def _set_value(th_name, pd_name, no_transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + if value.shape == (): + value = value.reshape(1) + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, buff in paddle_model.named_buffers(): + pd_params[name] = buff + for name, buff in torch_model.named_buffers(): + th_params[name] = buff + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if th_name.endswith('relative_position_bias_table'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name, no_transpose=True) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = Generator_paddle(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + device = torch.device('cpu') + # load weights from local + with dnnlib.util.open_url('./styleformer_pth_models/Pretrained_CIFAR10.pkl') as f: + torch_model = legacy.load_network_pkl(f)['G_ema'].to(device) # type: ignore + torch_model.eval() + + print('============================================') + + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(32, 512).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch, c=torch.ones(32)) + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + out_paddle = paddle_model(x_paddle, c=paddle.ones([32])) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + #print(out_torch[0, 0:20]) + #print('=======') + #print(out_paddle[0, 0:20]) + #assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + for i in range(32): + print(i, np.allclose(out_torch[i], out_paddle[i], atol = 1e-1)) + if not np.allclose(out_torch[i], out_paddle[i], atol = 1e-1): + print(out_torch[i]) + print('xxxxxxxxxxxxx') + print(out_paddle[i]) + + + # save weights for paddle model + model_path = os.path.join('./cifar10.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git 
a/gan/Styleformer/port_weights/load_pytorch_weights_lsun_church.py b/gan/Styleformer/port_weights/load_pytorch_weights_lsun_church.py new file mode 100644 index 00000000..6f166442 --- /dev/null +++ b/gan/Styleformer/port_weights/load_pytorch_weights_lsun_church.py @@ -0,0 +1,215 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +sys.path.append('./styleformer_pth') + +import argparse +import os +import numpy as np +import paddle +import torch +import legacy +import dnnlib +from training.networks_Generator import * +from generator import Generator as Generator_paddle +from config import * + + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/styleformer_lsun.yaml') +parser.add_argument('-dataset', type=str, default="cifar10") +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-data_path', type=str, default='/dataset/lsun_church/') +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + sum=0 + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + sum=sum+1 + print(sum) + print('----------------------------------') + + +def print_model_named_buffers(model): + sum=0 + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + sum=sum+1 + print(sum) + print('----------------------------------') + + +def torch_to_paddle_mapping(): + resolution = config.MODEL.GEN.RESOLUTION + prefix = f'synthesis.b{resolution}_0' + mapping = [ + (f'{prefix}.const', f'{prefix}.const'), + ] + num_layers = config.MODEL.GEN.NUM_LAYERS + # torch 'layers' to paddle 'stages' + num_stages = len(num_layers) + linformer = config.MODEL.GEN.LINFORMER + i = 0 + for i in range(num_stages): + stage_idx = 2**i * resolution + pp_s_prefix = f'synthesis.b{stage_idx}_' + th_s_prefix = f'synthesis.b{stage_idx}_' + mapping.extend([(f'{th_s_prefix}0.pos_embedding', f'{pp_s_prefix}0.pos_embedding')]) + + for block_idx in range(num_layers[i]): + th_b_prefix = f'{th_s_prefix}{block_idx}' + pp_b_prefix = f'{pp_s_prefix}{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.enc.q_weight', f'{pp_b_prefix}.enc.q_weight'), + (f'{th_b_prefix}.enc.k_weight', f'{pp_b_prefix}.enc.k_weight'), + (f'{th_b_prefix}.enc.v_weight', f'{pp_b_prefix}.enc.v_weight'), + (f'{th_b_prefix}.enc.w_weight', f'{pp_b_prefix}.enc.w_weight'), + (f'{th_b_prefix}.enc.u_weight', f'{pp_b_prefix}.enc.u_weight'), + (f'{th_b_prefix}.enc.bias', f'{pp_b_prefix}.enc.bias'), + (f'{th_b_prefix}.enc.affine1.weight', f'{pp_b_prefix}.enc.affine1.weight'), + 
(f'{th_b_prefix}.enc.affine1.bias', f'{pp_b_prefix}.enc.affine1.bias'), + (f'{th_b_prefix}.resample_filter', f'{pp_b_prefix}.resample_filter'), + (f'{th_b_prefix}.enc.noise_const', f'{pp_b_prefix}.enc.noise_const'), + (f'{th_b_prefix}.enc.noise_strength', f'{pp_b_prefix}.enc.noise_strength'), + ] + if stage_idx>=32 and linformer: + mapping.extend([(f'{th_s_prefix}0.proj_weight', f'{pp_s_prefix}0.proj_weight')]) + mapping.extend(layer_mapping) + + mapping.extend([ + (f'{th_b_prefix}.torgb.weight', f'{pp_b_prefix}.torgb.weight'), + (f'{th_b_prefix}.torgb.bias', f'{pp_b_prefix}.torgb.bias'), + (f'{th_b_prefix}.torgb.affine.weight', f'{pp_b_prefix}.torgb.affine.weight'), + (f'{th_b_prefix}.torgb.affine.bias', f'{pp_b_prefix}.torgb.affine.bias'), + ]) + i = i + 1 + mapping.extend([('mapping.fc0', 'mapping.fc0'), + ('mapping.fc1', 'mapping.fc1'), + ('mapping.w_avg', 'mapping.w_avg')]) + return mapping + + +def convert(torch_model, paddle_model): + + def _set_value(th_name, pd_name, no_transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + if value.shape == (): + value = value.reshape(1) + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, buff in paddle_model.named_buffers(): + pd_params[name] = buff + for name, buff in torch_model.named_buffers(): + th_params[name] = buff + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if th_name.endswith('relative_position_bias_table'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name, no_transpose=True) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = Generator_paddle(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + device = torch.device('cpu') + # load weights from local + with dnnlib.util.open_url('./styleformer_pth_models/Pretrained_LSUNchurch.pkl') as f: + torch_model = legacy.load_network_pkl(f)['G_ema'].to(device) # type: ignore + torch_model.eval() + + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(32, 512).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch, c=torch.zeros(1)) + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + out_paddle = paddle_model(x_paddle, c=paddle.zeros([1])) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:20]) + print('=======') + print(out_paddle[0, 0:20]) + #assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./lsun.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/gan/Styleformer/port_weights/load_pytorch_weights_stl10.py b/gan/Styleformer/port_weights/load_pytorch_weights_stl10.py new file mode 100644 index 00000000..ecc227c2 --- /dev/null +++ b/gan/Styleformer/port_weights/load_pytorch_weights_stl10.py @@ -0,0 +1,213 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
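
Each porting script ends with a numerical parity check: the torch and paddle generators receive the same random latents and their outputs are compared with `np.allclose` at a loose tolerance (the STL-10 script asserts `atol=1e-1`). The sketch below wraps that check as a reusable helper; `outputs_match` and its class-label handling are illustrative only and assume generators with the `(latents, c)` call signature used above.

```python
import numpy as np
import paddle
import torch

def outputs_match(torch_model, paddle_model, z_dim=512, batch=4, atol=1e-1):
    """Feed the same random latents to both generators and compare outputs."""
    x = np.random.randn(batch, z_dim).astype('float32')
    with torch.no_grad():
        out_t = torch_model(torch.from_numpy(x), c=torch.zeros(batch)).cpu().numpy()
    with paddle.no_grad():
        out_p = paddle_model(paddle.to_tensor(x), c=paddle.zeros([batch])).numpy()
    return np.allclose(out_t, out_p, atol=atol)
```
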
+ +import sys +sys.path.append('./styleformer_pth') + +import argparse +import os +import numpy as np +import paddle +import torch +import legacy +import dnnlib +from generator import Generator as Generator_paddle +from config import * + + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/styleformer_stl10.yaml') +parser.add_argument('-dataset', type=str, default="stl10") +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-data_path', type=str, default='/dataset/stl10/') +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + sum=0 + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + sum=sum+1 + print(sum) + print('----------------------------------') + + +def print_model_named_buffers(model): + sum=0 + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + sum=sum+1 + print(sum) + print('----------------------------------') + + +def torch_to_paddle_mapping(): + resolution = config.MODEL.GEN.RESOLUTION + prefix = f'synthesis.b{resolution}_0' + mapping = [ + (f'{prefix}.const', f'{prefix}.const'), + ] + num_layers = config.MODEL.GEN.NUM_LAYERS + # torch 'layers' to paddle 'stages' + num_stages = len(num_layers) + linformer = config.MODEL.GEN.LINFORMER + i = 0 + for i in range(num_stages): + stage_idx = 2**i * resolution + pp_s_prefix = f'synthesis.b{stage_idx}_' + th_s_prefix = f'synthesis.b{stage_idx}_' + mapping.extend([(f'{th_s_prefix}0.pos_embedding', f'{pp_s_prefix}0.pos_embedding')]) + + for block_idx in range(num_layers[i]): + th_b_prefix = f'{th_s_prefix}{block_idx}' + pp_b_prefix = f'{pp_s_prefix}{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.enc.q_weight', f'{pp_b_prefix}.enc.q_weight'), + (f'{th_b_prefix}.enc.k_weight', f'{pp_b_prefix}.enc.k_weight'), + (f'{th_b_prefix}.enc.v_weight', f'{pp_b_prefix}.enc.v_weight'), + (f'{th_b_prefix}.enc.w_weight', f'{pp_b_prefix}.enc.w_weight'), + (f'{th_b_prefix}.enc.u_weight', f'{pp_b_prefix}.enc.u_weight'), + (f'{th_b_prefix}.enc.bias', f'{pp_b_prefix}.enc.bias'), + (f'{th_b_prefix}.enc.affine1.weight', f'{pp_b_prefix}.enc.affine1.weight'), + (f'{th_b_prefix}.enc.affine1.bias', f'{pp_b_prefix}.enc.affine1.bias'), + (f'{th_b_prefix}.resample_filter', f'{pp_b_prefix}.resample_filter'), + (f'{th_b_prefix}.enc.noise_const', f'{pp_b_prefix}.enc.noise_const'), + (f'{th_b_prefix}.enc.noise_strength', f'{pp_b_prefix}.enc.noise_strength'), + ] + if stage_idx>=32 and linformer: + mapping.extend([(f'{th_s_prefix}0.proj_weight', f'{pp_s_prefix}0.proj_weight')]) + mapping.extend(layer_mapping) + + mapping.extend([ + (f'{th_b_prefix}.torgb.weight', f'{pp_b_prefix}.torgb.weight'), + (f'{th_b_prefix}.torgb.bias', f'{pp_b_prefix}.torgb.bias'), + (f'{th_b_prefix}.torgb.affine.weight', f'{pp_b_prefix}.torgb.affine.weight'), + (f'{th_b_prefix}.torgb.affine.bias', f'{pp_b_prefix}.torgb.affine.bias'), + ]) + i = i + 1 + mapping.extend([('mapping.fc0', 'mapping.fc0'), + ('mapping.fc1', 'mapping.fc1'), + ('mapping.w_avg', 'mapping.w_avg')]) + return mapping + + +def convert(torch_model, paddle_model): + + def 
_set_value(th_name, pd_name, no_transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + if value.shape == (): + value = value.reshape(1) + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, buff in paddle_model.named_buffers(): + pd_params[name] = buff + for name, buff in torch_model.named_buffers(): + th_params[name] = buff + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if th_name.endswith('relative_position_bias_table'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name, no_transpose=True) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = Generator_paddle(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + device = torch.device('cpu') + # load weights from local + with dnnlib.util.open_url('./styleformer_pth_models/stl10_pretrained.pkl') as f: + torch_model = legacy.load_network_pkl(f)['G_ema'].to(device) # type: ignore + torch_model.eval() + + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(32, 512).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch, c=torch.zeros(32)) + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + out_paddle = paddle_model(x_paddle, c=paddle.zeros([32])) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:20]) + print(out_paddle[0, 0:20]) + assert np.allclose(out_torch, out_paddle, atol = 1e-1) + + # save weights for paddle model + model_path = os.path.join('./stl10.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/gan/Styleformer/run_eval.sh b/gan/Styleformer/run_eval.sh new file mode 100644 index 00000000..ed7d75e5 --- /dev/null +++ b/gan/Styleformer/run_eval.sh @@ -0,0 +1,7 @@ +CUDA_VISIBLE_DEVICES=1 \ +python main_single_gpu.py \ +-cfg='./configs/styleformer_cifar10.yaml' \ +-dataset='cifar10' \ +-batch_size=64 \ +-eval \ +-pretrained='./cifar10' diff --git a/gan/Styleformer/run_eval_celeba.sh 
b/gan/Styleformer/run_eval_celeba.sh new file mode 100644 index 00000000..780e50c4 --- /dev/null +++ b/gan/Styleformer/run_eval_celeba.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=2 \ +python main_single_gpu.py \ +-cfg='./configs/styleformer_celeba.yaml' \ +-dataset='celeba' \ +-batch_size=64 \ +-eval \ +-pretrained='./celeba' \ +-data_path='/workspace/gan_datasets/celeba/img_align_celeba' diff --git a/gan/Styleformer/run_eval_lsun.sh b/gan/Styleformer/run_eval_lsun.sh new file mode 100644 index 00000000..f0a37764 --- /dev/null +++ b/gan/Styleformer/run_eval_lsun.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ +-cfg='./configs/styleformer_lsun.yaml' \ +-dataset='lsun' \ +-batch_size=128 \ +-eval \ +-pretrained='./lsun' \ +-data_path='/workspace/gan_datasets/church_outdoor_train_lmdb' diff --git a/gan/Styleformer/run_eval_multi.sh b/gan/Styleformer/run_eval_multi.sh new file mode 100644 index 00000000..901ef01a --- /dev/null +++ b/gan/Styleformer/run_eval_multi.sh @@ -0,0 +1,7 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/styleformer_cifar10.yaml' \ +-dataset='cifar10' \ +-batch_size=64 \ +-eval \ +-pretrained='./cifar10' diff --git a/gan/Styleformer/run_eval_multi_celeba.sh b/gan/Styleformer/run_eval_multi_celeba.sh new file mode 100644 index 00000000..505499f9 --- /dev/null +++ b/gan/Styleformer/run_eval_multi_celeba.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/styleformer_celeba.yaml' \ +-dataset='celeba' \ +-batch_size=128 \ +-eval \ +-pretrained='./celeba' \ +-data_path='/workspace/gan_datasets/celeba/img_align_celeba' diff --git a/gan/Styleformer/run_eval_multi_lsun.sh b/gan/Styleformer/run_eval_multi_lsun.sh new file mode 100644 index 00000000..5f479982 --- /dev/null +++ b/gan/Styleformer/run_eval_multi_lsun.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/styleformer_lsun.yaml' \ +-dataset='lsun' \ +-batch_size=128 \ +-eval \ +-pretrained='./lsun' \ +-data_path='/workspace/gan_datasets/church_outdoor_train_lmdb' diff --git a/gan/Styleformer/run_eval_multi_stl10.sh b/gan/Styleformer/run_eval_multi_stl10.sh new file mode 100644 index 00000000..574be006 --- /dev/null +++ b/gan/Styleformer/run_eval_multi_stl10.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/styleformer_stl10.yaml' \ +-dataset='stl10' \ +-batch_size=128 \ +-eval \ +-pretrained='./stl10' \ +-data_path='/workspace/gan_datasets/stl10_binary' diff --git a/gan/Styleformer/run_eval_stl10.sh b/gan/Styleformer/run_eval_stl10.sh new file mode 100644 index 00000000..d1ab6d1e --- /dev/null +++ b/gan/Styleformer/run_eval_stl10.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ +-cfg='./configs/styleformer_stl10.yaml' \ +-dataset='stl10' \ +-batch_size=128 \ +-eval \ +-pretrained='./stl10' \ +-data_path='/workspace/gan_datasets/stl10_binary' diff --git a/gan/Styleformer/run_generate.sh b/gan/Styleformer/run_generate.sh new file mode 100644 index 00000000..12d489af --- /dev/null +++ b/gan/Styleformer/run_generate.sh @@ -0,0 +1,5 @@ +python generate.py \ + -cfg='./configs/styleformer_cifar10.yaml' \ + -num_out_images=16 \ + -out_folder='./images_cifar10' \ + -pretrained='./cifar10.pdparams' diff --git a/gan/Styleformer/run_train.sh b/gan/Styleformer/run_train.sh new file mode 100644 index 00000000..6f2639ca --- /dev/null +++ b/gan/Styleformer/run_train.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=0 \ +python 
main_single_gpu.py \ +-cfg='./configs/styleformer_cifar10.yaml' \ +-dataset='cifar10' \ +-batch_size=32 \ +#-pretrained='./cifar10' diff --git a/gan/Styleformer/run_train_multi.sh b/gan/Styleformer/run_train_multi.sh new file mode 100644 index 00000000..dc17ffa6 --- /dev/null +++ b/gan/Styleformer/run_train_multi.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/styleformer_cifar10.yaml' \ +-dataset='cifar10' \ +-batch_size=32 \ +#-pretrained='./cifar10' diff --git a/gan/Styleformer/stl10_dataset.py b/gan/Styleformer/stl10_dataset.py new file mode 100644 index 00000000..04828277 --- /dev/null +++ b/gan/Styleformer/stl10_dataset.py @@ -0,0 +1,123 @@ + # Copyright (c) 2021 PPViT Authors. All Rights Reserved. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +""" +STL-10 Dataset and related methods +""" +import os +import numpy as np +from PIL import Image +from paddle.io import Dataset + + +class STL10Dataset(Dataset): + """paddle dataset for loading STL-10 binary data + This class will load the binary file from STL-10 dataset, + extract and read images and labels. Images are stored in numpy array, + with shape: [num_images, 96,96,3]. Labels are store in numpy array, with + shape: [num_images]. + + Args: + file_folder: str, folder path of STL-10 dataset binary files + mode: str, dataset mode, choose from ['train', 'test'], default: 'train' + transform: paddle.vision.transforms, transforms which is applied on data, default: None + """ + def __init__(self, file_folder, mode='train', transform=None): + super().__init__() + assert mode in ['train', 'test', 'unlabeled'] + self.folder = file_folder + self.transform = transform + self.height = 96 + self.width = 96 + self.channels = 3 + self.mode = mode + # num of bytes of a single image + self.image_bytes = self.height * self.width * self.channels + self.train_filepath = os.path.join(file_folder, f'{mode}_X.bin') + self.images = read_all_images(self.train_filepath) + + if mode != 'unlabeled': + self.label_filepath = os.path.join(file_folder, f'{mode}_y.bin') + self.labels = read_labels(self.label_filepath) + else: + self.labels = np.zeros(self.__len__()) + + print(f'----- STL-10 dataset {mode} len = {self.labels.shape[0]}') + + def __len__(self): + return self.images.shape[0] + + def __getitem__(self, index): + data = self.images[index] + if self.transform is not None: + data = self.transform(data) + label = self.labels[index] + return data, label + + +def read_labels(label_path): + """read data labels from binary file + Args: + label_path: label binary file path, e.g.,'train_y.bin' + Returns: + labels: np.array, the label array with shape [num_images] + """ + with open(label_path, 'rb') as infile: + labels = np.fromfile(infile, dtype=np.uint8) + return labels + + +def read_all_images(data_path): + """read all images from binary file + Args: + data_path: data binary file path, e.g.,'train_X.bin' + Returns: + images: np.array, the image array with shape [num_images, 96, 96, 3] + """ + with 
open(data_path, 'rb') as infile: + # read whole data in unit8 + data = np.fromfile(infile, dtype=np.uint8) + # images are stored in column major order + # 1st, 2nd, 3rd 96x96 are red, green, blue channels + images = np.reshape(data, (-1, 3, 96, 96)) + # outputs are with shape [num_images, height, width, channels] + images = np.transpose(images, (0, 3, 2, 1)) + return images + + +def save_image(image, name): + img = Image.fromarray(image) + img.save(f"{name}.png") + + +def save_images(images, labels, out_path): + for idx, image in enumerate(images): + out_path = os.path.join(out_path, str(labels[idx])) + os.makedirs(out_path, exist_ok=True) + save_image(image, os.path.join(out_path, str(idx)+'.png')) + + +## NOTE: this is for test, can be removed later +#if __name__ == "__main__": +# dataset = STL10Dataset(file_folder='./stl10_binary') +# print(dataset.labels.shape) +# for idx, (data, label) in enumerate(dataset): +# print(idx) +# print(data.shape) +# # save images to file +# save_image(data, f'{idx}.png') +# print(label) +# print('-----') +# if idx == 10: +# break diff --git a/gan/Styleformer/utils/equalized.py b/gan/Styleformer/utils/equalized.py new file mode 100644 index 00000000..34cece95 --- /dev/null +++ b/gan/Styleformer/utils/equalized.py @@ -0,0 +1,113 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .fused_act import fused_leaky_relu + + +class EqualConv2D(nn.Layer): + """This convolutional layer class stabilizes the learning rate changes of its parameters. + Equalizing learning rate keeps the weights in the network at a similar scale during training. + """ + def __init__(self, + in_channel, + out_channel, + kernel_size, + stride=1, + padding=0, + bias=True): + super().__init__() + + self.weight = self.create_parameter( + (out_channel, in_channel, kernel_size, kernel_size), + default_initializer=nn.initializer.Normal()) + self.scale = 1 / math.sqrt(in_channel * (kernel_size * kernel_size)) + + self.stride = stride + self.padding = padding + + if bias: + self.bias = self.create_parameter((out_channel, ), + nn.initializer.Constant(0.0)) + + else: + self.bias = None + + def forward(self, input): + out = F.conv2d( + input, + self.weight * self.scale, + bias=self.bias, + stride=self.stride, + padding=self.padding, + ) + + return out + + def __repr__(self): + return ( + f"{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]}," + f" {self.weight.shape[2]}, stride={self.stride}, padding={self.padding})" + ) + + +class EqualLinear(nn.Layer): + """This linear layer class stabilizes the learning rate changes of its parameters. + Equalizing learning rate keeps the weights in the network at a similar scale during training. 
+ """ + def __init__(self, + in_dim, + out_dim, + bias=True, + bias_init=0, + lr_mul=1, + activation=None): + super().__init__() + + self.weight = self.create_parameter( + (in_dim, out_dim), default_initializer=nn.initializer.Normal()) + self.weight.set_value((self.weight / lr_mul)) + + if bias: + self.bias = self.create_parameter( + (out_dim, ), nn.initializer.Constant(bias_init)) + + else: + self.bias = None + + self.activation = activation + + self.scale = (1 / math.sqrt(in_dim)) * lr_mul + self.lr_mul = lr_mul + + def forward(self, input): + if self.activation: + out = F.linear(input, self.weight * self.scale) + out = fused_leaky_relu(out, self.bias * self.lr_mul) + + else: + out = F.linear(input, + self.weight * self.scale, + bias=self.bias * self.lr_mul) + + return out + + def __repr__(self): + return ( + f"{self.__class__.__name__}({self.weight.shape[0]}, {self.weight.shape[1]})" + ) \ No newline at end of file diff --git a/gan/Styleformer/utils/fused_act.py b/gan/Styleformer/utils/fused_act.py new file mode 100644 index 00000000..70a63902 --- /dev/null +++ b/gan/Styleformer/utils/fused_act.py @@ -0,0 +1,48 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class FusedLeakyReLU(nn.Layer): + def __init__(self, channel, bias=True, negative_slope=0.2, scale=2 ** 0.5): + super().__init__() + + if bias: + self.bias = self.create_parameter((channel,), default_initializer=nn.initializer.Constant(0.0)) + + else: + self.bias = None + + self.negative_slope = negative_slope + self.scale = scale + + def forward(self, input): + return fused_leaky_relu(input, self.bias, self.negative_slope, self.scale) + + +def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2 ** 0.5): + if bias is not None: + rest_dim = [1] * (len(input.shape) - len(bias.shape) - 1) + return ( + F.leaky_relu( + input + bias.reshape((1, bias.shape[0], *rest_dim)), negative_slope=0.2 + ) + * scale + ) + + else: + return F.leaky_relu(input, negative_slope=0.2) * scale \ No newline at end of file diff --git a/gan/Styleformer/utils/upfirdn2d.py b/gan/Styleformer/utils/upfirdn2d.py new file mode 100644 index 00000000..678292fb --- /dev/null +++ b/gan/Styleformer/utils/upfirdn2d.py @@ -0,0 +1,194 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
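+# upfirdn2d = UPsample, FIR filter, DowNsample -- the resampling primitive used
+# by StyleGAN-style generators. upfirdn2d_native() below builds it from plain
+# paddle ops: zero-insertion upsampling by `up`, padding, a conv2d with the
+# flipped FIR kernel, then strided slicing to downsample by `down`.
+# Minimal usage sketch (illustrative only; gain handling is simplified):
+#   k = setup_filter([1, 3, 3, 1])          # normalized 2-D blur kernel from a 4-tap FIR
+#   y = upfirdn2d(x, k, up=2, pad=(2, 1))   # 2x upsample + blur for an NCHW tensor x
+# Heads-up: setup_filter() references np.newaxis although this module imports
+# `numpy` un-aliased, so the scalar-filter branch would raise NameError as written.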
+ +import numpy +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def setup_filter(f, normalize=True, flip_filter=False, gain=1, separable=None): + r"""Convenience function to setup 2D FIR filter for `upfirdn2d()`. + Args: + f: Torch tensor, numpy array, or python list of the shape + `[filter_height, filter_width]` (non-separable), + `[filter_taps]` (separable), + `[]` (impulse), or + `None` (identity). + device: Result device (default: cpu). + normalize: Normalize the filter so that it retains the magnitude + for constant input signal (DC)? (default: True). + flip_filter: Flip the filter? (default: False). + gain: Overall scaling factor for signal magnitude (default: 1). + separable: Return a separable filter? (default: select automatically). + Returns: + Float32 tensor of the shape + `[filter_height, filter_width]` (non-separable) or + `[filter_taps]` (separable). + """ + # Validate. + if f is None: + f = 1 + f = paddle.to_tensor(f, dtype='float32') + + if f.ndim == 0: + f = f[np.newaxis] + + # Separable? + if separable is None: + separable = (f.ndim == 1 and f.numel() >= 8) + if f.ndim == 1 and not separable: + f = f.numpy() + f = numpy.outer(f, f) + f = paddle.to_tensor(f) + + # Apply normalize, flip, gain, and device. + if normalize: + f /= f.sum() + if flip_filter: + f = f.flip(list(range(f.ndim))) + f = f * (gain ** (f.ndim / 2)) + f = f + return f + + +def upfirdn2d_native(input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, + pad_y0, pad_y1): + _, channel, in_h, in_w = input.shape + input = input.reshape((-1, in_h, in_w, 1)) + + _, in_h, in_w, minor = input.shape + kernel_h, kernel_w = kernel.shape + + out = input.reshape((-1, in_h, 1, in_w, 1, minor)) + out = out.transpose((0, 1, 3, 5, 2, 4)) + out = out.reshape((-1, 1, 1, 1)) + out = F.pad(out, [0, up_x - 1, 0, up_y - 1]) + out = out.reshape((-1, in_h, in_w, minor, up_y, up_x)) + out = out.transpose((0, 3, 1, 4, 2, 5)) + out = out.reshape((-1, minor, in_h * up_y, in_w * up_x)) + + out = F.pad( + out, [max(pad_x0, 0), + max(pad_x1, 0), + max(pad_y0, 0), + max(pad_y1, 0)]) + out = out[:, :, + max(-pad_y0, 0):out.shape[2] - max(-pad_y1, 0), + max(-pad_x0, 0):out.shape[3] - max(-pad_x1, 0), ] + + out = out.reshape( + ([-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1])) + w = paddle.flip(kernel, [0, 1]).reshape((1, 1, kernel_h, kernel_w)) + out = F.conv2d(out, w) + out = out.reshape(( + -1, + minor, + in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1, + in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1, + )) + out = out.transpose((0, 2, 3, 1)) + out = out[:, ::down_y, ::down_x, :] + + out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1 + out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1 + + return out.reshape((-1, channel, out_h, out_w)) + + +def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)): + out = upfirdn2d_native(input, kernel, up, up, down, down, pad[0], pad[1], + pad[0], pad[1]) + + return out + + +def make_kernel(k): + k = paddle.to_tensor(k, dtype='float32') + + if k.ndim == 1: + k = k.unsqueeze(0) * k.unsqueeze(1) + + k /= k.sum() + + return k + + +class Upfirdn2dUpsample(nn.Layer): + def __init__(self, kernel, factor=2): + super().__init__() + + self.factor = factor + kernel = make_kernel(kernel) * (factor * factor) + self.register_buffer("kernel", kernel) + + p = kernel.shape[0] - factor + + pad0 = (p + 1) // 2 + factor - 1 + pad1 = p // 2 + + self.pad = (pad0, pad1) + + def forward(self, input): + out = upfirdn2d(input, + self.kernel, + 
up=self.factor, + down=1, + pad=self.pad) + + return out + + +class Upfirdn2dDownsample(nn.Layer): + def __init__(self, kernel, factor=2): + super().__init__() + + self.factor = factor + kernel = make_kernel(kernel) + self.register_buffer("kernel", kernel) + + p = kernel.shape[0] - factor + + pad0 = (p + 1) // 2 + pad1 = p // 2 + + self.pad = (pad0, pad1) + + def forward(self, input): + out = upfirdn2d(input, + self.kernel, + up=1, + down=self.factor, + pad=self.pad) + + return out + + +class Upfirdn2dBlur(nn.Layer): + def __init__(self, kernel, pad, upsample_factor=1): + super().__init__() + + kernel = make_kernel(kernel) + + if upsample_factor > 1: + kernel = kernel * (upsample_factor * upsample_factor) + + self.register_buffer("kernel", kernel, persistable=False) + + self.pad = pad + + def forward(self, input): + out = upfirdn2d(input, self.kernel, pad=self.pad) + + return out \ No newline at end of file diff --git a/gan/Styleformer/utils/utils.py b/gan/Styleformer/utils/utils.py new file mode 100644 index 00000000..0b64d2a9 --- /dev/null +++ b/gan/Styleformer/utils/utils.py @@ -0,0 +1,280 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""utils for ViT + +Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training +and WarmupCosineScheduler for training + +""" + +import math +import pickle +import random +import numpy as np +import paddle +from paddle.optimizer.lr import LRScheduler +import paddle.distributed as dist +from paddle.optimizer.lr import LRScheduler + + +class AverageMeter(): + """ Meter for monitoring losses""" + def __init__(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.reset() + + def reset(self): + """reset all values to zeros""" + self.avg = 0 + self.sum = 0 + self.cnt = 0 + + def update(self, val, n=1): + """update avg by val and n, where val is the avg of n values""" + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + + +def get_exclude_from_weight_decay_fn(exclude_list=[]): + """ Set params with no weight decay during the training + + For certain params, e.g., positional encoding in ViT, weight decay + may not needed during the learning, this method is used to find + these params. + + Args: + exclude_list: a list of params names which need to exclude + from weight decay. + Returns: + exclude_from_weight_decay_fn: a function returns True if param + will be excluded from weight decay + """ + if len(exclude_list) == 0: + exclude_from_weight_decay_fn = None + else: + def exclude_fn(param): + for name in exclude_list: + if param.endswith(name): + return False + return True + exclude_from_weight_decay_fn = exclude_fn + return exclude_from_weight_decay_fn + + +class WarmupCosineScheduler(LRScheduler): + """Warmup Cosine Scheduler + + First apply linear warmup, then apply cosine decay schedule. 
+ Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" + Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining + "total_epochs - warmup_epochs" + + Attributes: + learning_rate: the starting learning rate (without warmup), not used here! + warmup_start_lr: warmup starting learning rate + start_lr: the starting learning rate (without warmup) + end_lr: the ending learning rate after whole loop + warmup_epochs: # of epochs for warmup + total_epochs: # of total epochs (include warmup) + """ + def __init__(self, + learning_rate, + warmup_start_lr, + start_lr, + end_lr, + warmup_epochs, + total_epochs, + cycles=0.5, + last_epoch=-1, + verbose=False): + """init WarmupCosineScheduler """ + self.warmup_epochs = warmup_epochs + self.total_epochs = total_epochs + self.warmup_start_lr = warmup_start_lr + self.start_lr = start_lr + self.end_lr = end_lr + self.cycles = cycles + super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + """ return lr value """ + if self.last_epoch < self.warmup_epochs: + val = (self.start_lr - self.warmup_start_lr) * float( + self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr + return val + + progress = float(self.last_epoch - self.warmup_epochs) / float( + max(1, self.total_epochs - self.warmup_epochs)) + val = max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) + return val + + +def all_gather(data): + """ run all_gather on any picklable data (do not requires tensors) + Args: + data: picklable object + Returns: + data_list: list of data gathered from each rank + """ + world_size = dist.get_world_size() + if world_size == 1: + return [data] + + buffer = pickle.dumps(data) #write data into Bytes and stores in buffer + np_buffer = np.frombuffer(buffer, dtype=np.int8) + tensor = paddle.to_tensor(np_buffer, dtype='int32') # uint8 doese not have many ops in paddle + + # obtain Tensor size of each rank + local_size = paddle.to_tensor([tensor.shape[0]]) + size_list = [] + dist.all_gather(size_list, local_size) + max_size = max(size_list) + + # receiving tensors from all ranks, + # all_gather does not support different shape, so we use padding + tensor_list = [] + if local_size != max_size: + padding = paddle.empty(shape=(max_size - local_size, ), dtype='int32') + tensor = paddle.concat((tensor, padding), axis=0) + dist.all_gather(tensor_list, tensor) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.astype('uint8').cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + +# GP +def gradient_penalty(discriminator, real, fake): + """gradient penalty""" + # BATCH_SIZE,C,H,W = real.shape + # the OP returns a random Tensor whose value is uniformly distributed + # within the range [min, max), with a shape of shape and a data type of dtype. + # epsilon ∼ U[0, 1]. 
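+    # WGAN-GP (Gulrajani et al., 2017) recipe:
+    #   x_hat = eps * real + (1 - eps) * fake,  eps ~ U[0, 1]
+    #   gp    = E[(||grad_{x_hat} D(x_hat)||_2 - 1)^2]
+    # Note: paddle.randn below samples eps from a standard normal rather than the
+    # uniform distribution described above; the uniform variant would use
+    # paddle.rand((real.shape[0], 1, 1, 1)).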
+ epsilon = paddle.randn((real.shape[0], 1, 1, 1)).cuda() + # extend epsilon to real shape size + # x_hat = real * epsilon + fake * (1 - epsilon), Picture after interpolation + interpolated_images = paddle.to_tensor((real * epsilon + fake * (1 - epsilon)), + stop_gradient=False) + # the interpolated picture calculates the discriminator score + mixed_scores = discriminator(interpolated_images) + # print(mixed_scores) + # fake = paddle.to_tensor(paddle.ones((real.shape[0], 1)), stop_gradient=True).cuda() + fake = paddle.ones((real.shape[0], 1)) + # calculate the blend gradient on the interpolated graph + # paddle.grad(outputs, inputs, grad_outputs=None, retain_graph=None, create_graph=False, + # only_inputs = True, allow_unused=False, no_grad_vars=None) + # for each input, calculate the gradient sum of all outputs relative to it + gradient = paddle.grad( + inputs=interpolated_images, + outputs=mixed_scores, + grad_outputs=fake, + create_graph=True, + retain_graph=True, + only_inputs=True)[0] + # try to make gradient points and flatten the gradient + gradient = paddle.reshape(gradient, (gradient.shape[0], -1)) + # L2 norm + gradient_norm = gradient.norm(2, axis=1) + # calculate gradient_penalty + gp = paddle.mean((gradient_norm - 1) ** 2) + return gp + +def DiffAugment(x, policy='', channels_first=True, affine=None): + "method based on Revisiting unreasonable effectiveness of data in deep learning era" + if policy: + if not channels_first: + x = x.transpose(0, 3, 1, 2) + for p in policy.split(','): + for f in AUGMENT_FNS[p]: + x = f(x, affine=affine) + if not channels_first: + x = x.transpose(0, 2, 3, 1) + return x + +# belong to DiffAugment +def rand_brightness(x, affine=None): + x = x + (paddle.rand(x.size(0), 1, 1, 1, dtype=x.dtype, device=x.device) - 0.5) + return x + +# belong to DiffAugment +def rand_saturation(x, affine=None): + x_mean = x.mean(dim=1, keepdim=True) + x = (x - x_mean) * (paddle.rand(x.size(0), 1, 1, 1, dtype=x.dtype, device=x.device) * 2) + x = x + x_mean + return x + +# belong to DiffAugment +def rand_contrast(x, affine=None): + x_mean = x.mean(dim=[1, 2, 3], keepdim=True) + x = (x - x_mean) * (paddle.rand(x.size(0), 1, 1, 1, dtype=x.dtype, device=x.device) + 0.5) + x = x + x_mean + return x + +# belong to DiffAugment +def rand_cutout(x, ratio=0.5, affine=None): + if random.random() < 0.3: + cutout_size = int(x.size(2) * ratio + 0.5), int(x.size(3) * ratio + 0.5) + offset_x = paddle.randint(0, + x.size(2) + (1 - cutout_size[0] % 2), size=[x.size(0), 1, 1], + device=x.device) + offset_y = paddle.randint(0, + x.size(3) + (1 - cutout_size[1] % 2), size=[x.size(0), 1, 1], + device=x.device) + grid_batch, grid_x, grid_y = paddle.meshgrid( + paddle.arange(x.size(0), dtype=paddle.long, device=x.device), + paddle.arange(cutout_size[0], dtype=paddle.long, device=x.device), + paddle.arange(cutout_size[1], dtype=paddle.long, device=x.device), + ) + grid_x = paddle.clamp(grid_x + offset_x - cutout_size[0] // 2, min=0, max=x.size(2) - 1) + grid_y = paddle.clamp(grid_y + offset_y - cutout_size[1] // 2, min=0, max=x.size(3) - 1) + del offset_x + del offset_y + mask = paddle.ones(x.size(0), x.size(2), x.size(3), dtype=x.dtype, device=x.device) + mask[grid_batch, grid_x, grid_y] = 0 + x = x * mask.unsqueeze(1) + del mask + del grid_x + del grid_y + del grid_batch + return x + +# belong to DiffAugment +def rand_translation(x, ratio=0.2, affine=None): + shift_x, shift_y = int(x.shape[2] * ratio + 0.5), int(x.shape[3] * ratio + 0.5) + translation_x = paddle.randint(-shift_x, shift_x + 1, 
shape=[x.shape[0], 1, 1]) + translation_y = paddle.randint(-shift_y, shift_y + 1, shape=[x.shape[0], 1, 1]) + grid_batch, grid_x, grid_y = paddle.meshgrid( + paddle.arange(x.shape[0]), + paddle.arange(x.shape[2]), + paddle.arange(x.shape[3]), + ) + grid_x = paddle.clip(grid_x + translation_x + 1, 0, x.shape[2] + 1) + grid_y = paddle.clip(grid_y + translation_y + 1, 0, x.shape[3] + 1) + x_pad = paddle.nn.functional.pad(x, [1, 1, 1, 1, 0, 0, 0, 0]) + x = x_pad.transpose([0, 2, 3, 1])[grid_batch, grid_x, grid_y].transpose([0, 3, 1, 2]) + return x + +AUGMENT_FNS = { + 'color': [rand_brightness, rand_saturation, rand_contrast], + 'translation': [rand_translation], + 'cutout': [rand_cutout], +} diff --git a/gan/transGAN/assets/TransGAN_1.png b/gan/transGAN/assets/TransGAN_1.png new file mode 100644 index 00000000..f5fd89ea Binary files /dev/null and b/gan/transGAN/assets/TransGAN_1.png differ diff --git a/gan/transGAN/assets/cifar_9_2.jpg b/gan/transGAN/assets/cifar_9_2.jpg new file mode 100644 index 00000000..3d1415f0 Binary files /dev/null and b/gan/transGAN/assets/cifar_9_2.jpg differ diff --git a/gan/transGAN/celeba_dataset.py b/gan/transGAN/celeba_dataset.py new file mode 100644 index 00000000..5065b02b --- /dev/null +++ b/gan/transGAN/celeba_dataset.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +CelebA Dataset related classes and methods +Currently only support for GAN +""" + +import os +import glob +from PIL import Image +from paddle.io import Dataset + +class CelebADataset(Dataset): + """Build CelebA dataset + + This class gets train/val imagenet datasets, which loads transfomed data and labels. + + Attributes: + file_folder: path where align and cropped images are stored + transform: preprocessing ops to apply on image + """ + + def __init__(self, file_folder, transform=None): + """CelebA Dataset with dataset file path, and transform""" + super().__init__() + self.file_folder = file_folder + self.transform = transform + self.img_path_list = glob.glob(os.path.join(file_folder, '*.jpg')) + print(f'----- CelebA img_align len = {len(self.img_path_list)}') + + def __len__(self): + return len(self.img_path_list) + + def __getitem__(self, index): + img = Image.open(self.img_path_list[index]).convert('RGB') + if self.transform is not None: + img = self.transform(img) + label = 0 + return img, label + +#if __name__ == "__main__": +# dataset = CelebADataset(file_folder='./celeba/img_align_celeba') +# for idx, (data, label) in enumerate(dataset): +# print(idx) +# print(data.size) +# print('-----') +# if idx == 10: +# break diff --git a/gan/transGAN/config.py b/gan/transGAN/config.py new file mode 100644 index 00000000..debfa0bf --- /dev/null +++ b/gan/transGAN/config.py @@ -0,0 +1,161 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration +Configuration for data, model archtecture, and training, etc. +Config can be set by .yaml file or by argparser(limited usage) +""" +import os +from yacs.config import CfgNode as CN +import yaml + +_C = CN() +_C.BASE = [''] + +_C.DATA = CN() +_C.DATA.BATCH_SIZE = 32 # train batch_size for single GPU +_C.DATA.DATA_PATH = '/dataset/imagenet/' # path to dataset +_C.DATA.DATASET = 'cifar10' # dataset name +_C.DATA.IMAGE_SIZE = 32 # input image size +_C.DATA.CROP_PCT = 0.875 # input image scale ratio, scale is applied before centercrop in eval mode +_C.DATA.NUM_WORKERS = 2 # number of data loading threads +_C.DATA.GEN_BATCH_SIZE = 128 # the batch size of gen +_C.DATA.DIS_BATCH_SIZE = 64 +_C.DATA.NUM_EVAL_IMAGES = 2000 # when calculate fid, default is 20000 +_C.DATA.DIFF_AUG = "" # when train the dis_net, have to choose the aug method +_C.DATA.BATCH_SIZE_EVAL = 32 # val batch_size for single GPU +_C.DATA.MAX_GEN_NUM = None # max num of generate images for validation +_C.DATA.MAX_REAL_NUM = None # max num of real images for validation + + +# model settings +_C.MODEL = CN() +_C.MODEL.TYPE = 'TransGAN' +_C.MODEL.NAME = 'TransGAN' +_C.MODEL.RESUME = None +_C.MODEL.PRETRAINED = None +_C.MODEL.NUM_CLASSES = 10 +_C.MODEL.DROPOUT = 0.1 + +# transformer settings +_C.MODEL.TRANS = CN() +_C.MODEL.TYPE = "transGAN" +_C.MODEL.GEN_MODEL = "ViT_custom" +_C.MODEL.DIS_MODEL = "ViT_custom_scale2" +_C.MODEL.PATCH_SIZE = 2 +_C.MODEL.LATENT_DIM = 256 # Hidden dim +_C.MODEL.GF_DIM = 1024 +_C.MODEL.DF_DIM = 384 +_C.MODEL.BOTTOM_WIDTH = 8 # decide the DisBlock's window_size +_C.MODEL.FAED_IN = 0.0 +_C.MODEL.D_DEPTH = 3 # the depth of DisBlock +_C.MODEL.G_DEPTH = "5,4,2" # the depth of the Block in StageBlock +_C.MODEL.G_NORM = "ln" # the norm in gen_net +_C.MODEL.D_NORM = "ln" # the norm in dis_net +_C.MODEL.G_ACT = "gelu" # the activation in gen_net +_C.MODEL.D_ACT = "gelu" # the activation in dis_net +_C.MODEL.G_MLP = 4 # decide the mlp_hidden_dim in MLP of gen_net, dim * mlp_ratio +_C.MODEL.D_MLP = 4 # decide the mlp_hidden_dim in MLP of dis_net +_C.MODEL.D_WINDOW_SIZE = 8 # calculate the relative_position_bias which adjust attn + +# training settings +_C.TRAIN = CN() +_C.TRAIN.LAST_EPOCH = 0 +_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 3 +_C.TRAIN.WEIGHT_DECAY = 0.05 +_C.TRAIN.BASE_LR = 0.001 +_C.TRAIN.WARMUP_START_LR = 1e-6 +_C.TRAIN.END_LR = 5e-4 +_C.TRAIN.GRAD_CLIP = 1.0 +_C.TRAIN.ACCUM_ITER = 2 #1 +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' +_C.TRAIN.LR_SCHEDULER.MILESTONES = "30, 60, 90" # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 # only used in StepLRScheduler + +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'AdamW' +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for adamW +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# misc +_C.SAVE = "./output" +_C.TAG = "default" +_C.SAVE_FREQ = 10 # freq to save chpt +_C.REPORT_FREQ = 100 # freq to logging info +_C.VALIDATE_FREQ = 100 # freq to do validation +_C.SEED = 20 +_C.EVAL = 
False # run evaluation only +_C.LOCAL_RANK = 0 +_C.NGPUS = -1 +_C.LATENT_NORM = False +_C.LR_DECAY = False + +def _update_config_from_file(config, cfg_file): + config.defrost() + with open(cfg_file, 'r') as infile: + yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader) + for cfg in yaml_cfg.setdefault('BASE', ['']): + if cfg: + _update_config_from_file( + config, os.path.join(os.path.dirname(cfg_file), cfg) + ) + print('merging config from {}'.format(cfg_file)) + config.merge_from_file(cfg_file) + config.freeze() + + +def update_config(config, args): + """Update config by ArgumentParser + Args: + args: ArgumentParser contains options + Return: + config: updated config + """ + if args.cfg: + _update_config_from_file(config, args.cfg) + config.defrost() + if args.dataset: + config.DATA.DATASET = args.dataset + if args.batch_size: + config.DATA.BATCH_SIZE = args.batch_size + if args.image_size: + config.DATA.IMAGE_SIZE = args.image_size + if args.data_path: + config.DATA.DATA_PATH = args.data_path + if args.ngpus: + config.NGPUS = args.ngpus + if args.eval: + config.EVAL = True + config.DATA.BATCH_SIZE_EVAL = args.batch_size + if args.pretrained: + config.MODEL.PRETRAINED = args.pretrained + if args.resume: + config.MODEL.RESUME = args.resume + if args.last_epoch: + config.MODEL.LAST_EPOCH = args.last_epoch + + #config.freeze() + return config + + +def get_config(cfg_file=None): + """Return a clone of config or load from yaml file""" + config = _C.clone() + if cfg_file: + _update_config_from_file(config, cfg_file) + return config diff --git a/gan/transGAN/configs/transgan_cifar10.yaml b/gan/transGAN/configs/transgan_cifar10.yaml new file mode 100644 index 00000000..1036eb63 --- /dev/null +++ b/gan/transGAN/configs/transgan_cifar10.yaml @@ -0,0 +1,13 @@ +DATA: + IMAGE_SIZE: 32 + DATASET: 'cifar10' + MAX_GEN_NUM: 50000 + MAX_REAL_NUM: None +MODEL: + TYPE: TransGAN + NAME: TransGAN + NUM_CLASSES: 10 + GEN_MODEL: "ViT_custom" + DIS_MODEL: "ViT_custom_scale2" + + diff --git a/gan/transGAN/datasets.py b/gan/transGAN/datasets.py new file mode 100644 index 00000000..dd70e8cc --- /dev/null +++ b/gan/transGAN/datasets.py @@ -0,0 +1,178 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Dataset related classes and methods for ViT training and validation +Cifar10, Cifar100 and ImageNet2012 are supported +""" + +import os +import math +from paddle.io import Dataset, DataLoader, DistributedBatchSampler +from paddle.vision import transforms, datasets, image_load + + +class ImageNet2012Dataset(Dataset): + """Build ImageNet2012 dataset + + This class gets train/val imagenet datasets, which loads transfomed data and labels. 
+ Attributes: + file_folder: path where imagenet images are stored + transform: preprocessing ops to apply on image + img_path_list: list of full path of images in whole dataset + label_list: list of labels of whole dataset + """ + + def __init__(self, file_folder, mode="train", transform=None): + """Init ImageNet2012 Dataset with dataset file path, mode(train/val), and transform""" + super(ImageNet2012Dataset, self).__init__() + assert mode in ["train", "val"] + self.file_folder = file_folder + self.transform = transform + self.img_path_list = [] + self.label_list = [] + + if mode == "train": + self.list_file = os.path.join(self.file_folder, "train_list.txt") + else: + self.list_file = os.path.join(self.file_folder, "val_list.txt") + + with open(self.list_file, 'r') as infile: + for line in infile: + img_path = line.strip().split()[0] + img_label = int(line.strip().split()[1]) + self.img_path_list.append(os.path.join(self.file_folder, img_path)) + self.label_list.append(img_label) + print(f'----- Imagenet2012 image {mode} list len = {len(self.label_list)}') + + def __len__(self): + return len(self.label_list) + + def __getitem__(self, index): + data = image_load(self.img_path_list[index]).convert('RGB') + data = self.transform(data) + label = self.label_list[index] + + return data, label + +def get_train_transforms(config): + """ Get training transforms + For training, a RandomResizedCrop is applied, then normalization is applied with + [0.5, 0.5, 0.5] mean and std. The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + transforms_train = transforms.Compose([ + #transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), + # scale=(0.05, 1.0)), + transforms.ToTensor(), + #transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + #transforms.ToTensor(), + ]) + return transforms_train + + +def get_val_transforms(config): + """ Get training transforms + For validation, image is first Resize then CenterCrop to image_size. + Then normalization is applied with [0.5, 0.5, 0.5] mean and std. + The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) + transforms_val = transforms.Compose([ + transforms.Resize(scale_size, 'bicubic'), # single int for resize shorter side of image + transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + ]) + return transforms_val + + +def get_dataset(config, mode='train'): + """ Get dataset from config and mode (train/val) + Returns the related dataset object according to configs and mode(train/val) + Args: + config: configs contains dataset related settings. 
see config.py for details + Returns: + dataset: dataset object + """ + assert mode in ['train', 'val', 'test'] + if config.DATA.DATASET == "cifar10": + if mode == 'train': + dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "cifar100": + if mode == 'train': + dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "imagenet2012": + if mode == 'train': + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_train_transforms(config)) + else: + mode = 'val' + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_val_transforms(config)) + else: + raise NotImplementedError( + "[{config.DATA.DATASET}] Only cifar10, cifar100, imagenet2012 are supported now") + return dataset + + +def get_dataloader(config, dataset, mode='train', multi_process=False): + """Get dataloader with config, dataset, mode as input, allows multiGPU settings. + Multi-GPU loader is implements as distributedBatchSampler. + Args: + config: see config.py for details + dataset: paddle.io.dataset object + mode: train/val + multi_process: if True, use DistributedBatchSampler to support multi-processing + Returns: + dataloader: paddle.io.DataLoader object. + """ + + if mode == 'train': + batch_size = config.DATA.BATCH_SIZE + else: + batch_size = config.DATA.BATCH_SIZE_EVAL + + if multi_process is True: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=(mode == 'train')) + dataloader = DataLoader(dataset, + batch_sampler=sampler, + num_workers=config.DATA.NUM_WORKERS) + else: + dataloader = DataLoader(dataset, + batch_size=batch_size, + num_workers=config.DATA.NUM_WORKERS, + shuffle=(mode == 'train')) + return dataloader diff --git a/gan/transGAN/generate.py b/gan/transGAN/generate.py new file mode 100644 index 00000000..bfc9f01e --- /dev/null +++ b/gan/transGAN/generate.py @@ -0,0 +1,78 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Generate images using trained models""" +import argparse +import os +from PIL import Image +import paddle +from models.ViT_custom import Generator +from config import get_config +from config import update_config + +def main(): + """ generate sample images using pretrained model + The following args are required: + -cfg: str, path of yaml model config file + -pretrained: str, path of the pretrained model (ends with .pdparams) + -num_out_images: int, the num of output images to be saved in file + -out_folder: str, output folder path. 
+ """ + parser = argparse.ArgumentParser('transGAN') + parser.add_argument('-cfg', type=str, default=None) + parser.add_argument('-dataset', type=str, default=None) + parser.add_argument('-batch_size', type=int, default=None) + parser.add_argument('-image_size', type=int, default=None) + parser.add_argument('-data_path', type=str, default=None) + parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-pretrained', type=str, default=None) + parser.add_argument('-resume', type=str, default=None) + parser.add_argument('-num_out_images', type=int, default=16) + parser.add_argument('-out_folder', type=str, default='./out_images_cifar10') + parser.add_argument('-eval', action='store_true') + parser.add_argument('-last_epoch', type=int, default=None) + args = parser.parse_args() + + # get default config + config = get_config() + # update config by arguments + config = update_config(config, args) + + # get model + print(f'----- Creating model...') + paddle_model = Generator(config) + paddle_model.eval() + # load model weights + print(f'----- Loading model form {config.MODEL.PRETRAINED}...') + model_state_dict = paddle.load(config.MODEL.PRETRAINED) + paddle_model.load_dict(model_state_dict['gen_state_dict']) + # get random input tensor + x_paddle = paddle.randn([args.num_out_images, config.MODEL.LATENT_DIM]) + # inference + print(f'----- Inferencing...') + out_paddle = paddle_model(x_paddle, 0) + # post processing to obtain image + print('----- Postprocessing') + gen_imgs = (out_paddle * 127.5 + 128).clip(0,255).transpose((0, 2, 3, 1)) + gen_imgs = gen_imgs.astype('uint8').cpu().numpy() + # save images to file + os.makedirs(args.out_folder, exist_ok=True) + print(f'----- Saving images to {args.out_folder}') + for i, gen_img in enumerate(gen_imgs): + img = Image.fromarray(gen_img, 'RGB') + out_path = os.path.join(args.out_folder, str(i) + '.png') + img.save(out_path) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/gan/transGAN/load_pytorch_weights_cifar10.py b/gan/transGAN/load_pytorch_weights_cifar10.py new file mode 100644 index 00000000..c4353627 --- /dev/null +++ b/gan/transGAN/load_pytorch_weights_cifar10.py @@ -0,0 +1,373 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
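+# Port the official TransGAN CIFAR-10 checkpoint (generator + discriminator,
+# loaded from 'avg_gen_state_dict' / 'dis_state_dict') from PyTorch to
+# PaddlePaddle. The name mappings below strip the torch nn.DataParallel prefix
+# ('module.') and pair parameters/buffers one-to-one; convert() copies each
+# tensor, roughly transposing 2-D weights whenever the torch [out, in] and
+# paddle [in, out] shapes disagree, and explicitly transposing the square
+# 'attn.proj.weight' matrices whose shapes match either way.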
+ +"""load weight """ + +import sys +import argparse +import json +import os +import numpy as np +import torch +import torch.nn as nn +import paddle +import TransGAN.models_search as models_search +from models.ViT_custom import Generator +from models.ViT_custom_scale2 import Discriminator +from config import get_config, update_config +import matplotlib.pyplot as plt + +sys.path.append("../TransGAN") +sys.path.append("..") + +def print_model_named_params(model): + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + print('----------------------------------') + +def print_model_named_buffers(model): + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + print('----------------------------------') + +def torch_to_paddle_mapping(): + py_prefix = 'module' + mapping = [ + (f'{py_prefix}.pos_embed_1', 'pos_embed_1'), + (f'{py_prefix}.pos_embed_2', 'pos_embed_2'), + (f'{py_prefix}.pos_embed_3', 'pos_embed_3'), + (f'{py_prefix}.l1.weight', 'l1.weight'), + (f'{py_prefix}.l1.bias', 'l1.bias'), + ] + + num_layers_1 = 5 + for idx in range(num_layers_1): + ly_py_prefix = f'blocks.block.{idx}' + layer_mapping = [ + (f'{py_prefix}.{ly_py_prefix}.norm1.norm.weight', f'{ly_py_prefix}.norm1.norm.weight'), + (f'{py_prefix}.{ly_py_prefix}.norm1.norm.bias', f'{ly_py_prefix}.norm1.norm.bias'), + (f'{py_prefix}.{ly_py_prefix}.attn.relative_position_bias_table', f'{ly_py_prefix}.attn.relative_position_bias_table'), + (f'{py_prefix}.{ly_py_prefix}.attn.qkv.weight', f'{ly_py_prefix}.attn.qkv.weight'), + (f'{py_prefix}.{ly_py_prefix}.attn.proj.weight', f'{ly_py_prefix}.attn.proj.weight'), + (f'{py_prefix}.{ly_py_prefix}.attn.proj.bias', f'{ly_py_prefix}.attn.proj.bias'), + (f'{py_prefix}.{ly_py_prefix}.norm2.norm.weight', f'{ly_py_prefix}.norm2.norm.weight'), + (f'{py_prefix}.{ly_py_prefix}.norm2.norm.bias', f'{ly_py_prefix}.norm2.norm.bias'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc1.weight', f'{ly_py_prefix}.mlp.fc1.weight'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc1.bias', f'{ly_py_prefix}.mlp.fc1.bias'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc2.weight', f'{ly_py_prefix}.mlp.fc2.weight'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc2.bias', f'{ly_py_prefix}.mlp.fc2.bias'), + ] + mapping.extend(layer_mapping) + + num_layers_2 = 4 + for idx in range(num_layers_2): + ly_py_prefix = f'upsample_blocks.0.block.{idx}' + layer_mapping = [ + (f'{py_prefix}.{ly_py_prefix}.norm1.norm.weight', f'{ly_py_prefix}.norm1.norm.weight'), + (f'{py_prefix}.{ly_py_prefix}.norm1.norm.bias', f'{ly_py_prefix}.norm1.norm.bias'), + (f'{py_prefix}.{ly_py_prefix}.attn.relative_position_bias_table', f'{ly_py_prefix}.attn.relative_position_bias_table'), + (f'{py_prefix}.{ly_py_prefix}.attn.qkv.weight', f'{ly_py_prefix}.attn.qkv.weight'), + (f'{py_prefix}.{ly_py_prefix}.attn.proj.weight', f'{ly_py_prefix}.attn.proj.weight'), + (f'{py_prefix}.{ly_py_prefix}.attn.proj.bias', f'{ly_py_prefix}.attn.proj.bias'), + (f'{py_prefix}.{ly_py_prefix}.norm2.norm.weight', f'{ly_py_prefix}.norm2.norm.weight'), + (f'{py_prefix}.{ly_py_prefix}.norm2.norm.bias', f'{ly_py_prefix}.norm2.norm.bias'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc1.weight', f'{ly_py_prefix}.mlp.fc1.weight'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc1.bias', f'{ly_py_prefix}.mlp.fc1.bias'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc2.weight', f'{ly_py_prefix}.mlp.fc2.weight'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc2.bias', f'{ly_py_prefix}.mlp.fc2.bias'), + ] + 
mapping.extend(layer_mapping) + + num_layers_3 = 2 + for idx in range(num_layers_3): + ly_py_prefix = f'upsample_blocks.1.block.{idx}' + layer_mapping = [ + (f'{py_prefix}.{ly_py_prefix}.norm1.norm.weight', f'{ly_py_prefix}.norm1.norm.weight'), + (f'{py_prefix}.{ly_py_prefix}.norm1.norm.bias', f'{ly_py_prefix}.norm1.norm.bias'), + (f'{py_prefix}.{ly_py_prefix}.attn.relative_position_bias_table', f'{ly_py_prefix}.attn.relative_position_bias_table'), + (f'{py_prefix}.{ly_py_prefix}.attn.qkv.weight', f'{ly_py_prefix}.attn.qkv.weight'), + (f'{py_prefix}.{ly_py_prefix}.attn.proj.weight', f'{ly_py_prefix}.attn.proj.weight'), + (f'{py_prefix}.{ly_py_prefix}.attn.proj.bias', f'{ly_py_prefix}.attn.proj.bias'), + (f'{py_prefix}.{ly_py_prefix}.norm2.norm.weight', f'{ly_py_prefix}.norm2.norm.weight'), + (f'{py_prefix}.{ly_py_prefix}.norm2.norm.bias', f'{ly_py_prefix}.norm2.norm.bias'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc1.weight', f'{ly_py_prefix}.mlp.fc1.weight'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc1.bias', f'{ly_py_prefix}.mlp.fc1.bias'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc2.weight', f'{ly_py_prefix}.mlp.fc2.weight'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc2.bias', f'{ly_py_prefix}.mlp.fc2.bias'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + (f'{py_prefix}.deconv.0.weight', 'deconv.0.weight'), + (f'{py_prefix}.deconv.0.bias', 'deconv.0.bias') + ] + mapping.extend(head_mapping) + + return mapping + +def torch_to_paddle_mapping_dis(): + py_prefix = 'module' + mapping_all = [] + mapping_dis = [ + (f'{py_prefix}.cls_token', 'cls_token'), + (f'{py_prefix}.pos_embed_1', 'pos_embed_1'), + (f'{py_prefix}.pos_embed_2', 'pos_embed_2'), + (f'{py_prefix}.fRGB_1.weight', 'fRGB_1.weight'), + (f'{py_prefix}.fRGB_1.bias', 'fRGB_1.bias'), + (f'{py_prefix}.fRGB_2.weight', 'fRGB_2.weight'), + (f'{py_prefix}.fRGB_2.bias', 'fRGB_2.bias'), + ] + + num_layers_1 = 3 + for idx in range(num_layers_1): + ly_py_prefix = f'blocks_1.{idx}' + layer_mapping = [ + (f'{py_prefix}.{ly_py_prefix}.norm1.norm.weight', f'{ly_py_prefix}.norm1.norm.weight'), + (f'{py_prefix}.{ly_py_prefix}.norm1.norm.bias', f'{ly_py_prefix}.norm1.norm.bias'), + (f'{py_prefix}.{ly_py_prefix}.attn.noise_strength_1', f'{ly_py_prefix}.attn.noise_strength_1'), + (f'{py_prefix}.{ly_py_prefix}.attn.relative_position_bias_table', f'{ly_py_prefix}.attn.relative_position_bias_table'), + (f'{py_prefix}.{ly_py_prefix}.attn.qkv.weight', f'{ly_py_prefix}.attn.qkv.weight'), + (f'{py_prefix}.{ly_py_prefix}.attn.proj.weight', f'{ly_py_prefix}.attn.proj.weight'), + (f'{py_prefix}.{ly_py_prefix}.attn.proj.bias', f'{ly_py_prefix}.attn.proj.bias'), + (f'{py_prefix}.{ly_py_prefix}.norm2.norm.weight', f'{ly_py_prefix}.norm2.norm.weight'), + (f'{py_prefix}.{ly_py_prefix}.norm2.norm.bias', f'{ly_py_prefix}.norm2.norm.bias'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc1.weight', f'{ly_py_prefix}.mlp.fc1.weight'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc1.bias', f'{ly_py_prefix}.mlp.fc1.bias'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc2.weight', f'{ly_py_prefix}.mlp.fc2.weight'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc2.bias', f'{ly_py_prefix}.mlp.fc2.bias'), + ] + mapping_dis.extend(layer_mapping) + + num_layers_2 = 3 + for idx in range(num_layers_2): + ly_py_prefix = f'blocks_2.{idx}' + layer_mapping = [ + (f'{py_prefix}.{ly_py_prefix}.norm1.norm.weight', f'{ly_py_prefix}.norm1.norm.weight'), + (f'{py_prefix}.{ly_py_prefix}.norm1.norm.bias', f'{ly_py_prefix}.norm1.norm.bias'), + (f'{py_prefix}.{ly_py_prefix}.attn.noise_strength_1', 
f'{ly_py_prefix}.attn.noise_strength_1'), + (f'{py_prefix}.{ly_py_prefix}.attn.relative_position_bias_table', f'{ly_py_prefix}.attn.relative_position_bias_table'), + (f'{py_prefix}.{ly_py_prefix}.attn.qkv.weight', f'{ly_py_prefix}.attn.qkv.weight'), + (f'{py_prefix}.{ly_py_prefix}.attn.proj.weight', f'{ly_py_prefix}.attn.proj.weight'), + (f'{py_prefix}.{ly_py_prefix}.attn.proj.bias', f'{ly_py_prefix}.attn.proj.bias'), + (f'{py_prefix}.{ly_py_prefix}.norm2.norm.weight', f'{ly_py_prefix}.norm2.norm.weight'), + (f'{py_prefix}.{ly_py_prefix}.norm2.norm.bias', f'{ly_py_prefix}.norm2.norm.bias'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc1.weight', f'{ly_py_prefix}.mlp.fc1.weight'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc1.bias', f'{ly_py_prefix}.mlp.fc1.bias'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc2.weight', f'{ly_py_prefix}.mlp.fc2.weight'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc2.bias', f'{ly_py_prefix}.mlp.fc2.bias'), + ] + mapping_dis.extend(layer_mapping) + + num_layers_3 = 1 + for idx in range(num_layers_3): + ly_py_prefix = f'last_block.{idx}' + layer_mapping = [ + (f'{py_prefix}.{ly_py_prefix}.norm1.norm.weight', f'{ly_py_prefix}.norm1.norm.weight'), + (f'{py_prefix}.{ly_py_prefix}.norm1.norm.bias', f'{ly_py_prefix}.norm1.norm.bias'), + (f'{py_prefix}.{ly_py_prefix}.attn.noise_strength_1', f'{ly_py_prefix}.attn.noise_strength_1'), + (f'{py_prefix}.{ly_py_prefix}.attn.qkv.weight', f'{ly_py_prefix}.attn.qkv.weight'), + (f'{py_prefix}.{ly_py_prefix}.attn.proj.weight', f'{ly_py_prefix}.attn.proj.weight'), + (f'{py_prefix}.{ly_py_prefix}.attn.proj.bias', f'{ly_py_prefix}.attn.proj.bias'), + (f'{py_prefix}.{ly_py_prefix}.norm2.norm.weight', f'{ly_py_prefix}.norm2.norm.weight'), + (f'{py_prefix}.{ly_py_prefix}.norm2.norm.bias', f'{ly_py_prefix}.norm2.norm.bias'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc1.weight', f'{ly_py_prefix}.mlp.fc1.weight'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc1.bias', f'{ly_py_prefix}.mlp.fc1.bias'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc2.weight', f'{ly_py_prefix}.mlp.fc2.weight'), + (f'{py_prefix}.{ly_py_prefix}.mlp.fc2.bias', f'{ly_py_prefix}.mlp.fc2.bias'), + ] + mapping_dis.extend(layer_mapping) + + head_mapping = [ + (f'{py_prefix}.norm.norm.weight', 'norm.norm.weight'), + (f'{py_prefix}.norm.norm.bias', 'norm.norm.bias'), + (f'{py_prefix}.head.weight', 'head.weight'), + (f'{py_prefix}.head.bias', 'head.bias'), + ] + mapping_dis.extend(head_mapping) + + return mapping_dis + +def convert(torch_model, paddle_model, mapping): + def _set_value(th_name, pd_name, no_transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + if value.shape == (): + value = value.reshape(1) + if th_name.find("attn.proj.weight") != -1 and th_shape == pd_shape: # prevent shape[1]==shape[0] + value = value.transpose((1, 0)) + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + if str(value.shape)[1:-2] == str(pd_params[pd_name].shape)[1:-2]: + pd_params[pd_name].set_value(value) + else: + pd_params[pd_name].set_value(value.T) + + # 1. 
get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + if mapping == "gen": + mapping = torch_to_paddle_mapping() + else: + mapping = torch_to_paddle_mapping_dis() + + # 3. set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + +def main(): + parser = argparse.ArgumentParser('transGAN') + parser.add_argument('-cfg', type=str, default=None) + parser.add_argument('-dataset', type=str, default=None) + parser.add_argument('-batch_size', type=int, default=None) + parser.add_argument('-image_size', type=int, default=None) + parser.add_argument('-data_path', type=str, default=None) + parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-pretrained', type=str, default=None) + parser.add_argument('-resume', type=str, default=None) + parser.add_argument('-last_epoch', type=int, default=None) + parser.add_argument('-eval', action='store_true') + args = parser.parse_args() + + # get default config + config = get_config() + # update config by arguments + config = update_config(config, args) + config.freeze() + + parser = argparse.ArgumentParser() + args_torch = parser.parse_args() + with open('../TransGAN/commandline_args.txt', 'r') as f: + args_torch.__dict__ = json.load(f) + + paddle.set_device('cpu') + paddle_model_gen = Generator(args=config) + paddle_model_dis = Discriminator(args=config) + + paddle_model_gen.eval() + paddle_model_dis.eval() + + print_model_named_params(paddle_model_gen) + print_model_named_buffers(paddle_model_gen) + + print_model_named_params(paddle_model_dis) + print_model_named_buffers(paddle_model_dis) + + device = torch.device('cpu') + torch_model_gen = eval('models_search.'+'ViT_custom_new'+'.Generator')(args=args_torch) + torch_model_gen = torch.nn.DataParallel(torch_model_gen.to("cuda:0"), device_ids=[0]) + + torch_model_dis = eval('models_search.'+'ViT_custom_scale2'+'.Discriminator')(args=args_torch) + torch_model_dis = torch.nn.DataParallel(torch_model_dis.to("cuda:0"), device_ids=[0]) + + print_model_named_params(torch_model_gen) + print_model_named_buffers(torch_model_gen) + + print_model_named_params(torch_model_dis) + print_model_named_buffers(torch_model_dis) + + checkpoint = torch.load("../cifar_checkpoint") + torch_model_gen.load_state_dict(checkpoint['avg_gen_state_dict']) + torch_model_dis.load_state_dict(checkpoint['dis_state_dict']) + + torch_model_gen = torch_model_gen.to(device) + torch_model_gen.eval() + torch_model_dis = torch_model_dis.to(device) + torch_model_dis.eval() + + # convert weights + paddle_model_gen = convert(torch_model_gen, paddle_model_gen, "gen") + paddle_model_dis = convert(torch_model_dis, paddle_model_dis, "dis") + + # check correctness + x = np.random.normal(0, 1, (args_torch.eval_batch_size, args_torch.latent_dim)) + z_paddle = paddle.to_tensor(x, dtype="float32") + 
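    # The same numpy noise batch x feeds both frameworks below, so that after
    # conversion the generated images and discriminator scores printed later can
    # be compared element-wise between torch and paddle.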
z_torch = torch.cuda.FloatTensor(x) + epoch = 0 + device_cor = torch.device('cuda') + torch_model_gen = torch_model_gen.to(device_cor) + torch_model_dis = torch_model_dis.to(device_cor) + gen_imgs_torch = torch_model_gen(z_torch, epoch) + fake_validity_torch = torch_model_dis(gen_imgs_torch) + gen_imgs_torch = gen_imgs_torch.mul_(127.5).add_(127.5).clamp_(0.0, 255.0) + gen_imgs_torch = gen_imgs_torch.permute(0, 2, 3, 1).to('cpu', torch.uint8).numpy() + plt.figure() + for i in range(1, 5): + plt.subplot(2, 2, i) + plt.imshow(gen_imgs_torch[i-1]) + plt.xticks([]) + plt.yticks([]) + plt.draw() + plt.savefig(str("test_torch") + '.png') + print("gen_img_torch", gen_imgs_torch.flatten()[:10]) + print("fake_validity_torch", fake_validity_torch.flatten()[:5]) + + model_state = paddle.load('./transgan_cifar10.pdparams') + paddle_model_gen.set_dict(model_state['gen_state_dict']) + paddle_model_dis.set_dict(model_state['dis_state_dict']) + gen_imgs_paddle = paddle_model_gen(z_paddle, epoch) + fake_validity_paddle = paddle_model_dis(gen_imgs_paddle) + gen_imgs_paddle = paddle.add(paddle.multiply(gen_imgs_paddle, paddle.to_tensor(127.5)), paddle.to_tensor(127.5)) + gen_imgs_paddle = paddle.clip(gen_imgs_paddle.transpose((0, 2, 3, 1)), min=0.0, max=255.0).astype('uint8').cpu().numpy() + plt.figure() + for i in range(1, 5): + plt.subplot(2, 2, i) + plt.imshow(gen_imgs_paddle[i-1]) + plt.xticks([]) + plt.yticks([]) + plt.draw() + plt.savefig(str("test_paddle") + '.png') + print("gen_imgs_paddle", gen_imgs_paddle.flatten()[:10]) + print("fake_validity_paddle", fake_validity_paddle.flatten()[:5]) + + #save weights for paddle model + model_path = os.path.join('./transgan_cifar10.pdparams') + paddle.save({ + 'gen_state_dict': paddle_model_gen.state_dict(), + 'dis_state_dict': paddle_model_dis.state_dict(), + }, model_path) + print('all done') + +if __name__ == "__main__": + main() diff --git a/gan/transGAN/lsun_church_dataset.py b/gan/transGAN/lsun_church_dataset.py new file mode 100644 index 00000000..e106b391 --- /dev/null +++ b/gan/transGAN/lsun_church_dataset.py @@ -0,0 +1,124 @@ + # Copyright (c) 2021 PPViT Authors. All Rights Reserved. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +""" +LSUN-church Dataset and related methods +""" +import os +import io +import numpy as np +import lmdb +from PIL import Image +from paddle.io import Dataset + + +class LSUNchurchDataset(Dataset): + """paddle dataset for loading LSUN-church binary data + This class will load the lmdb file from LSUN-church dataset, + extract and read images. 
Images are stored in list of numpy array + + Args: + file_folder: str, folder path of LSUN-church dataset lmdb + mode: str, dataset mode, choose from ['train', 'val'], default: 'train' + transform: paddle.vision.transforms, transforms which is applied on data, default: None + max_num_images: int, num of images used in the dataset, + if None, use all the images, default: None + """ + def __init__(self, file_folder, mode='train', transform=None, max_num_images=None): + super().__init__() + assert mode in ['train', 'val'] + self.transform = transform + self.file_folder = file_folder + with lmdb.open(file_folder, + map_size=1099511627776, + max_readers=32, + readonly=True, + readahead=False, + meminit=False, + lock=False).begin(write=False) as txn: + self.num_images = txn.stat()['entries'] + # efficient way of loading keys only + self.keys = list(txn.cursor().iternext(values=False)) + + self.txn = None + self.env = None + + if max_num_images is not None: + self.num_images = min(self.num_images, max_num_images) + + print(f'----- LSUN-church dataset {mode} len = {self.num_images}') + + def open_lmdb(self): + """ Open lmdb, this method is called in __getitem__ method + Note that lmdb is not opened in __init__ method, to support multi-process. + Reference: https://github.com/pytorch/vision/issues/689 + """ + self.env = lmdb.open(self.file_folder, + max_readers=32, + readonly=True, + readahead=False, + meminit=False, + lock=False) + self.txn = self.env.begin(buffers=True) + + def __len__(self): + return self.num_images + + def __getitem__(self, index): + if not hasattr(self, 'txn'): + self.open_lmdb() + key = self.keys[index] + image_bytes = self.txn.get(key) + image = read_image(image_bytes) + if self.transform is not None: + image = self.transform(image) + label = 0 + return image, label + + +def read_image(image_bytes): + """read image from bytes loaded from lmdb file + Args: + image_bytes: bytes, image data in bytes + Returns: + image: np.array, stores the image with shape [h, w, c] + """ + image = Image.open(io.BytesIO(image_bytes)) + image = np.array(image) + return image + + +def save_image(image, name): + img = Image.fromarray(image) + img.save(f"{name}.png") + + +def save_images(images, labels, out_path): + for idx, image in enumerate(images): + out_path = os.path.join(out_path, str(labels[idx])) + os.makedirs(out_path, exist_ok=True) + save_image(image, os.path.join(out_path, str(idx))) + + +## NOTE: this is for test, can be removed later +#if __name__ == "__main__": +# dataset = LSUNchurchDataset(file_folder='./church_outdoor_train_lmdb') +# for idx, (data, label) in enumerate(dataset): +# print(idx) +# print(data.shape) +# # save images to file +# save_image(data, f'lsun_{idx}') +# print('-----') +# if idx == 10: +# break diff --git a/gan/transGAN/main_multi_gpu.py b/gan/transGAN/main_multi_gpu.py new file mode 100644 index 00000000..7079cb19 --- /dev/null +++ b/gan/transGAN/main_multi_gpu.py @@ -0,0 +1,431 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""transGAN training/validation using single GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import matplotlib.pyplot as plt +import paddle +import paddle.distributed as dist +from datasets import get_dataloader +from datasets import get_dataset +from utils import AverageMeter +from utils import WarmupCosineScheduler +from utils import normal_ +from utils import constant_ +from utils import all_gather +from config import get_config +from config import update_config +from metrics.fid import FID +from models.ViT_custom import Generator +from models.ViT_custom_scale2 import Discriminator + + +parser = argparse.ArgumentParser('transGAN') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +parser_args = parser.parse_args() + + +# log format +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, parser_args) + + +config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +file_handler = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +file_handler.setFormatter(logging.Formatter(log_format)) +logger.addHandler(file_handler) +logger.info(f'config= {config}') + +def weights_init(m): + classname = m.__class__.__name__ + if classname.find('Conv2d') != -1: + # nn.init.xavier_uniform(m.weight.data, 1.) + normal_(m.weight, 0.0, 0.02) + elif classname.find('BatchNorm2d') != -1: + normal_(m.weight.data, 1.0, 0.02) + constant_(m.bias.data, 0.0) + + +def validate(dataloader, + model, + batch_size, + total_batch, + num_classes, + max_real_num=None, + max_gen_num=None, + debug_steps=32): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a transGAN gen_net model + batch_size: int, batch size (used to init FID measturement) + total_batch: int, total num of epoch, for logging + max_real_num: int, max num of real images loaded from dataset + max_gen_num: int, max num of fake images genearted for validation + debug_steps: int, num of iters to log info + Returns: + fid_score: float, fid score + val_time: int, validation time in ms + """ + model.eval() + time_st = time.time() + fid = FID(batch_size) + fid_preds_all = [] + fid_gts_all = [] + # similar to metric type: fid50k_full, fid50k, etc. 
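    # max_real_num / max_gen_num are caps on image counts; dividing by batch_size
    # converts them to caps on batch counts (e.g. a 50000-image cap with
    # batch_size=64 allows 781 full batches). A cap of None means the whole
    # dataloader (total_batch batches) is consumed.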
+ if max_real_num is not None: + max_real_batch = max_real_num // batch_size + else: + max_real_batch = total_batch + if max_gen_num is not None: + max_gen_batch = max_gen_num // batch_size + else: + max_gen_batch = total_batch + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + if batch_id >= max_real_batch: + break + curr_batch_size = data[0].shape[0] + fid.batch_size = curr_batch_size + + real_image = data[0] + z_paddle = paddle.randn([curr_batch_size, config.MODEL.LATENT_DIM]) + + gen_imgs_paddle = model(z_paddle, 0) + gen_imgs_paddle = (gen_imgs_paddle * 127.5 + 128).clip(0, 255).astype('uint8') + gen_imgs_paddle = gen_imgs_paddle / 255.0 + + fid.update(gen_imgs_paddle, real_image) + + if batch_id < max_gen_batch: + # gather all fid related data from other gpus + fid_preds_list = all_gather(fid.preds) + fid_preds = sum(fid_preds_list, []) + fid_preds_all.extend(fid_preds) + + fid_gts_list = all_gather(fid.gts) + fid_gts = sum(fid_gts_list, []) + fid_gts_all.extend(fid_gts) + + fid.reset() + if batch_id % debug_steps == 0: + if batch_id >= max_gen_batch: + logger.info(f"Val Step[{batch_id:04d}/{total_batch:04d}] done (no gen)") + else: + logger.info(f"Val Step[{batch_id:04d}/{total_batch:04d}] done") + + fid.preds = fid_preds_all + fid.gts = fid_gts_all + fid_score = fid.accumulate() + val_time = time.time() - time_st + return fid_score, val_time + + +def train(args, + gen_net, + dis_net, + gen_optimizer, + dis_optimizer, + lr_schedulers, + dataloader, + epoch, + total_batch, + debug_steps=2, + accum_iter=1): + """Training for one epoch + Args: + args: the default set of net + gen_net: nn.Layer, the generator net + dis_net: nn.Layer, the discriminator net + gen_optimizer: generator's optimizer + dis_optimizer: discriminator's optimizer + dataloader: paddle.io.DataLoader, dataloader instance + lr_schedulers: learning rate + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_time + """ + gen_net.train() + dis_net.train() + train_loss_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + # Sample noise as generator input + z = paddle.to_tensor(np.random.normal(0, 1, (image.shape[0], config.MODEL.LATENT_DIM))) + + # --------------------- + # Train Discriminator + # --------------------- + dis_optimizer.clear_grad() + real_validity = dis_net(image) + fake_imgs = gen_net(paddle.to_tensor(z, dtype="float32"), epoch).detach() + fake_validity = dis_net(fake_imgs) + d_loss = 0 + d_loss = paddle.mean(nn.ReLU()(1.0 - real_validity)) + paddle.mean(nn.ReLU()(1 + fake_validity)) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + d_loss = d_loss / accum_iter + d_loss.backward() + dis_optimizer.step() + batch_size = image.shape[0] + train_loss_meter.update(d_loss.numpy()[0], batch_size) + + # ----------------- + # Train Generator + # ----------------- + if epoch % 2 == 0: + gen_optimizer.clear_grad() + z = np.random.normal(0, 1, (args.DATA.GEN_BATCH_SIZE, args.MODEL.LATENT_DIM)) + gen_z = paddle.to_tensor(z, dtype="float32") + gen_imgs = gen_net(gen_z, epoch) + fake_validity = dis_net(gen_imgs) + # cal loss + g_loss = -paddle.mean(fake_validity) + g_loss.backward() + gen_optimizer.step() + batch_size = image.shape[0] + 
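            # NOTE: the same AverageMeter already accumulates d_loss above, so the
            # "Avg Loss" logged below is a running blend of the discriminator and
            # generator losses, not either one in isolation.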
train_loss_meter.update(g_loss.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, ") + train_time = time.time() - time_st + return train_loss_meter.avg, train_time + + +def main_worker(*args): + # 0. Preparation + dist.init_parallel_env() + last_epoch = config.TRAIN.LAST_EPOCH + world_size = dist.get_world_size() + local_rank = dist.get_rank() + logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + seed = config.SEED + local_rank + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # 1. Create model + gen_net = Generator(args=config) + dis_net = Discriminator(args=config) + gen_net = paddle.DataParallel(gen_net) + dis_net = paddle.DataParallel(dis_net) + + gen_net.apply(weights_init) + dis_net.apply(weights_init) + + # 2. Create train and val dataloader + dataset_train, dataset_val = args[0], args[1] + dataloader_train = get_dataloader(config, dataset_train, 'train', True) + dataloader_val = get_dataloader(config, dataset_val, 'val', True) + total_batch_train = len(dataloader_train) + total_batch_val = len(dataloader_val) + logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + # 3. Define criterion + # training loss is defined in train method + # validation criterion (FID) is defined in validate method + + # 4. Define lr_scheduler + gen_scheduler = None + dis_scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + gen_scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + dis_scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + gen_scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + # 5. 
Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + gen_optimizer = paddle.optimizer.AdamW( + parameters=gen_net.parameters(), + learning_rate=gen_scheduler if gen_scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip) + dis_optimizer = paddle.optimizer.AdamW( + parameters=dis_net.parameters(), + learning_rate=dis_scheduler if dis_scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # 6. Load pretrained model or load resume model and optimizer states + if config.MODEL.PRETRAINED: + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + gen_net.set_dict(model_state["gen_state_dict"]) + dis_net.set_dict(model_state["dis_state_dict"]) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + # load model weights + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') + gen_net.set_dict(model_state["gen_state_dict"]) + dis_net.set_dict(model_state["dis_state_dict"]) + # load optimizer + opt_state = paddle.load(config.MODEL.RESUME + '.pdopt') + gen_optimizer.set_state_dict(opt_state["gen_state_dict"]) + dis_optimizer.set_state_dict(opt_state["dis_state_dict"]) + logger.info(f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") + + # 7. Validation + if config.EVAL: + logger.info('----- Start Validating') + fid_score, val_time = validate( + dataloader=dataloader_train, # using training set + model=gen_net, + batch_size=config.DATA.BATCH_SIZE, + total_batch=total_batch_train, # using training set size + num_classes=config.MODEL.NUM_CLASSES, + max_real_num=config.DATA.MAX_REAL_NUM // config.NGPUS if config.DATA.MAX_REAL_NUM else None, + max_gen_num=config.DATA.MAX_GEN_NUM // config.NGPUS if config.DATA.MAX_GEN_NUM else None, + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation fid_score: {fid_score:.4f}, " + + f"time: {val_time:.2f}") + return + + # 8. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + lr_schedulers = (gen_scheduler, dis_scheduler) if config.LR_DECAY else None + logging.info(f"Now training epoch {epoch}. gen LR={gen_optimizer.get_lr():.6f}") + logging.info(f"Now training epoch {epoch}. 
dis LR={dis_optimizer.get_lr():.6f}") + train_loss, train_time = train(config, + gen_net, + dis_net, + gen_optimizer, + dis_optimizer, + lr_schedulers, + dataloader=dataloader_train, + epoch=epoch, + total_batch=len(dataloader_train), + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER) + # lr_schedulers.step() + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + fid_score, val_time = validate( + dataloader=dataloader_val, + model=gen_net, + batch_size=config.DATA.BATCH_SIZE, + total_batch=total_batch_val, + num_classes=config.MODEL.NUM_CLASSES, + max_real_num=config.DATA.MAX_REAL_NUM // config.NGPUS if config.DATA.MAX_REAL_NUM else None, + max_gen_num=config.DATA.MAX_GEN_NUM // config.NGPUS if config.DATA.MAX_GEN_NUM else None, + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation fid_score: {fid_score:.4f}, " + + f"time: {val_time:.2f}") + # model save + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save({"gen_state_dict":gen_net.state_dict(), + "dis_state_dict":dis_net.state_dict()}, model_path + '.pdparams') + paddle.save({"gen_state_dict":gen_optimizer.state_dict(), + "dis_state_dict":dis_optimizer.state_dict()}, model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + +def main(): + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + + +if __name__ == "__main__": + main() diff --git a/gan/transGAN/main_single_gpu.py b/gan/transGAN/main_single_gpu.py new file mode 100644 index 00000000..146a9b02 --- /dev/null +++ b/gan/transGAN/main_single_gpu.py @@ -0,0 +1,408 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""transGAN training/validation using single GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import matplotlib.pyplot as plt +import paddle +import paddle.nn as nn +from datasets import get_dataloader +from datasets import get_dataset +from utils import AverageMeter +from utils import WarmupCosineScheduler +from utils import normal_ +from utils import constant_ +from config import get_config +from config import update_config +from metrics.fid import FID +from models.ViT_custom import Generator +from models.ViT_custom_scale2 import Discriminator + + +parser = argparse.ArgumentParser('transGAN') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +args = parser.parse_args() + +# log format +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, args) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + +def weights_init(m): + classname = m.__class__.__name__ + if classname.find('Conv2d') != -1: + # nn.init.xavier_uniform(m.weight.data, 1.) + normal_(m.weight, 0.0, 0.02) + elif classname.find('BatchNorm2d') != -1: + normal_(m.weight.data, 1.0, 0.02) + constant_(m.bias.data, 0.0) + + +def validate(dataloader, + model, + batch_size, + total_batch, + num_classes, + max_real_num=None, + max_gen_num=None, + debug_steps=32): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a transGAN gen_net model + batch_size: int, batch size (used to init FID measturement) + total_batch: int, total num of epoch, for logging + max_real_num: int, max num of real images loaded from dataset + max_gen_num: int, max num of fake images genearted for validation + debug_steps: int, num of iters to log info + Returns: + fid_score: float, fid score + val_time: int, validation time in ms + """ + model.eval() + time_st = time.time() + fid = FID(batch_size) + fid_preds_all = [] + fid_gts_all = [] + # similar to metric type: fid50k_full, fid50k, etc. 
+ if max_real_num is not None: + max_real_batch = max_real_num // batch_size + else: + max_real_batch = total_batch + if max_gen_num is not None: + max_gen_batch = max_gen_num // batch_size + else: + max_gen_batch = total_batch + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + if batch_id >= max_real_batch: + break + curr_batch_size = data[0].shape[0] + fid.batch_size = curr_batch_size + + real_image = data[0] + z_paddle = paddle.randn([curr_batch_size, config.MODEL.LATENT_DIM]) + + gen_imgs_paddle = model(z_paddle, 0) + gen_imgs_paddle = (gen_imgs_paddle * 127.5 + 128).clip(0, 255).astype('uint8') + gen_imgs_paddle = gen_imgs_paddle / 255.0 + + fid.update(gen_imgs_paddle, real_image) + + if batch_id < max_gen_batch: + fid_preds_all.extend(fid.preds) + fid_gts_all.extend(fid.gts) + fid.reset() + if batch_id % debug_steps == 0: + if batch_id >= max_gen_batch: + logger.info(f"Val Step[{batch_id:04d}/{total_batch:04d}] done (no gen)") + else: + logger.info(f"Val Step[{batch_id:04d}/{total_batch:04d}] done") + + fid.preds = fid_preds_all + fid.gts = fid_gts_all + fid_score = fid.accumulate() + val_time = time.time() - time_st + return fid_score, val_time + + +def train(args, + gen_net, + dis_net, + gen_optimizer, + dis_optimizer, + lr_schedulers, + dataloader, + epoch, + total_batch, + debug_steps=2, + accum_iter=1): + """Training for one epoch + Args: + args: the default set of net + gen_net: nn.Layer, the generator net + dis_net: nn.Layer, the discriminator net + gen_optimizer: generator's optimizer + dis_optimizer: discriminator's optimizer + dataloader: paddle.io.DataLoader, dataloader instance + lr_schedulers: learning rate + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_time + """ + gen_net.train() + dis_net.train() + train_loss_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + # Sample noise as generator input + z = paddle.to_tensor(np.random.normal(0, 1, (image.shape[0], config.MODEL.LATENT_DIM))) + + # --------------------- + # Train Discriminator + # --------------------- + dis_optimizer.clear_grad() + real_validity = dis_net(image) + fake_imgs = gen_net(paddle.to_tensor(z, dtype="float32"), epoch).detach() + fake_validity = dis_net(fake_imgs) + d_loss = 0 + d_loss = paddle.mean(nn.ReLU()(1.0 - real_validity)) + paddle.mean(nn.ReLU()(1 + fake_validity)) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + d_loss = d_loss / accum_iter + d_loss.backward() + dis_optimizer.step() + batch_size = image.shape[0] + train_loss_meter.update(d_loss.numpy()[0], batch_size) + + # ----------------- + # Train Generator + # ----------------- + if epoch % 2 == 0: + gen_optimizer.clear_grad() + z = np.random.normal(0, 1, (args.DATA.GEN_BATCH_SIZE, args.MODEL.LATENT_DIM)) + gen_z = paddle.to_tensor(z, dtype="float32") + gen_imgs = gen_net(gen_z, epoch) + fake_validity = dis_net(gen_imgs) + # cal loss + g_loss = -paddle.mean(fake_validity) + g_loss.backward() + gen_optimizer.step() + batch_size = image.shape[0] + train_loss_meter.update(g_loss.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + 
f"Avg Loss: {train_loss_meter.avg:.4f}, ") + train_time = time.time() - time_st + return train_loss_meter.avg, train_time + + +def main(): + # 0. Preparation + last_epoch = config.TRAIN.LAST_EPOCH + seed = config.SEED + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # 1. Create model + gen_net = Generator(args=config) + dis_net = Discriminator(args=config) + gen_net = paddle.DataParallel(gen_net) + dis_net = paddle.DataParallel(dis_net) + + gen_net.apply(weights_init) + dis_net.apply(weights_init) + + # 2. Create train and val dataloader + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='test') + dataloader_train = get_dataloader(config, dataset_train, 'train', False) + dataloader_val = get_dataloader(config, dataset_val, 'test', False) + + # 3. Define criterion + # training loss is defined in train method + # validation criterion (FID) is defined in validate method + + # 4. Define lr_scheduler + gen_scheduler = None + dis_scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + gen_scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + dis_scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + gen_scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + # 5. Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + gen_optimizer = paddle.optimizer.AdamW( + parameters=gen_net.parameters(), + learning_rate=gen_scheduler if gen_scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip) + dis_optimizer = paddle.optimizer.AdamW( + parameters=dis_net.parameters(), + learning_rate=dis_scheduler if dis_scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # 6. 
Load pretrained model or load resume model and optimizer states + if config.MODEL.PRETRAINED: + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + gen_net.set_dict(model_state["gen_state_dict"]) + dis_net.set_dict(model_state["dis_state_dict"]) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + # load model weights + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') + gen_net.set_dict(model_state["gen_state_dict"]) + dis_net.set_dict(model_state["dis_state_dict"]) + # load optimizer + opt_state = paddle.load(config.MODEL.RESUME + '.pdopt') + gen_optimizer.set_state_dict(opt_state["gen_state_dict"]) + dis_optimizer.set_state_dict(opt_state["dis_state_dict"]) + logger.info(f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") + + # 7. Validation + if config.EVAL: + logger.info('----- Start Validating') + fid_score, val_time = validate( + dataloader=dataloader_train, # using training set + model=gen_net, + batch_size=config.DATA.BATCH_SIZE, + total_batch=len(dataloader_train), # using training set + num_classes=config.MODEL.NUM_CLASSES, + max_real_num=config.DATA.MAX_REAL_NUM, + max_gen_num=config.DATA.MAX_GEN_NUM, + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation fid_score: {fid_score:.4f}, " + + f"time: {val_time:.2f}") + return + + # 8. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + lr_schedulers = (gen_scheduler, dis_scheduler) if config.LR_DECAY else None + logging.info(f"Now training epoch {epoch}. gen LR={gen_optimizer.get_lr():.6f}") + logging.info(f"Now training epoch {epoch}. 
dis LR={dis_optimizer.get_lr():.6f}") + train_loss, train_time = train(config, + gen_net, + dis_net, + gen_optimizer, + dis_optimizer, + lr_schedulers, + dataloader=dataloader_train, + epoch=epoch, + total_batch=len(dataloader_train), + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + ) + # lr_schedulers.step() + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + fid_score, val_time = validate( + dataloader=dataloader_val, + model=gen_net, + batch_size=config.DATA.BATCH_SIZE, + total_batch=len(dataloader_val), + num_classes=config.MODEL.NUM_CLASSES, + max_real_num=config.DATA.MAX_REAL_NUM, + max_gen_num=config.DATA.MAX_GEN_NUM, + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation fid_score: {fid_score:.4f}, " + + f"time: {val_time:.2f}") + # model save + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save({"gen_state_dict":gen_net.state_dict(), + "dis_state_dict":dis_net.state_dict()}, model_path + '.pdparams') + paddle.save({"gen_state_dict":gen_optimizer.state_dict(), + "dis_state_dict":dis_optimizer.state_dict()}, model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + +if __name__ == "__main__": + main() diff --git a/gan/transGAN/metrics/Registry.py b/gan/transGAN/metrics/Registry.py new file mode 100644 index 00000000..e1de1c66 --- /dev/null +++ b/gan/transGAN/metrics/Registry.py @@ -0,0 +1,125 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import traceback + + +class Registry(object): + """ + The registry that provides name -> object mapping, to support third-party users' custom modules. + To create a registry (inside ppgan): + .. code-block:: python + BACKBONE_REGISTRY = Registry('BACKBONE') + To register an object: + .. code-block:: python + @BACKBONE_REGISTRY.register() + class MyBackbone(): + ... + Or: + .. code-block:: python + BACKBONE_REGISTRY.register(MyBackbone) + """ + def __init__(self, name): + """ + Args: + name (str): the name of this registry + """ + self._name = name + + self._obj_map = {} + + def _do_register(self, name, obj): + assert ( + name not in self._obj_map + ), "An object named '{}' was already registered in '{}' registry!".format( + name, self._name) + self._obj_map[name] = obj + + def register(self, obj=None, name=None): + """ + Register the given object under the the name `obj.__name__`. + Can be used as either a decorator or not. See docstring of this class for usage. 
+ """ + if obj is None: + # used as a decorator + def deco(func_or_class, name=name): + if name is None: + name = func_or_class.__name__ + self._do_register(name, func_or_class) + return func_or_class + + return deco + + # used as a function call + if name is None: + name = obj.__name__ + self._do_register(name, obj) + + def get(self, name): + ret = self._obj_map.get(name) + if ret is None: + raise KeyError( + "No object named '{}' found in '{}' registry!".format( + name, self._name)) + + return ret + + +def build_from_config(cfg, registry, default_args=None): + """Build a class from config dict. + Args: + cfg (dict): Config dict. It should at least contain the key "name". + registry (ppgan.utils.Registry): The registry to search the name from. + default_args (dict, optional): Default initialization arguments. + Returns: + class: The constructed class. + """ + if not isinstance(cfg, dict): + raise TypeError(f'cfg must be a dict, but got {type(cfg)}') + if 'name' not in cfg: + if default_args is None or 'name' not in default_args: + raise KeyError( + '`cfg` or `default_args` must contain the key "name", ' + f'but got {cfg}\n{default_args}') + if not isinstance(registry, Registry): + raise TypeError('registry must be an ppgan.utils.Registry object, ' + f'but got {type(registry)}') + if not (isinstance(default_args, dict) or default_args is None): + raise TypeError('default_args must be a dict or None, ' + f'but got {type(default_args)}') + + args = cfg.copy() + + if default_args is not None: + for name, value in default_args.items(): + args.setdefault(name, value) + + cls_name = args.pop('name') + if isinstance(cls_name, str): + obj_cls = registry.get(cls_name) + elif inspect.isclass(cls_name): + obj_cls = obj_cls + else: + raise TypeError( + f'name must be a str or valid name, but got {type(cls_name)}') + + try: + instance = obj_cls(**args) + except Exception as e: + stack_info = traceback.format_exc() + print("Fail to initial class [{}] with error: " + "{} and stack:\n{}".format(cls_name, e, str(stack_info))) + raise e + return instance \ No newline at end of file diff --git a/gan/transGAN/metrics/__init__.py b/gan/transGAN/metrics/__init__.py new file mode 100644 index 00000000..08fd7121 --- /dev/null +++ b/gan/transGAN/metrics/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .psnr_ssim import PSNR, SSIM +from .fid import FID +from .builder import build_metric +from .Registry import Registry diff --git a/gan/transGAN/metrics/builder.py b/gan/transGAN/metrics/builder.py new file mode 100644 index 00000000..440ec3b2 --- /dev/null +++ b/gan/transGAN/metrics/builder.py @@ -0,0 +1,27 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import paddle + +from .Registry import * + +METRICS = Registry("METRIC") + + +def build_metric(cfg): + cfg_ = cfg.copy() + name = cfg_.pop('name', None) + metric = METRICS.get(name)(**cfg_) + return metric diff --git a/gan/transGAN/metrics/fid.py b/gan/transGAN/metrics/fid.py new file mode 100644 index 00000000..60394a36 --- /dev/null +++ b/gan/transGAN/metrics/fid.py @@ -0,0 +1,302 @@ +#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import os +import fnmatch +import numpy as np +import cv2 +import paddle +from PIL import Image +from cv2 import imread +from scipy import linalg +from .inception import InceptionV3 +from paddle.utils.download import get_weights_path_from_url +from .builder import METRICS + +try: + from tqdm import tqdm +except: + + def tqdm(x): + return x + + +""" based on https://github.com/mit-han-lab/gan-compression/blob/master/metric/fid_score.py +""" +""" +inceptionV3 pretrain model is convert from pytorch, pretrain_model url is https://paddle-gan-models.bj.bcebos.com/params_inceptionV3.tar.gz +""" +INCEPTIONV3_WEIGHT_URL = "https://paddlegan.bj.bcebos.com/InceptionV3.pdparams" + +@METRICS.register() +class FID(paddle.metric.Metric): + def __init__(self, batch_size=1, use_GPU=True, dims = 2048, premodel_path=None, model=None): + self.batch_size = batch_size + self.use_GPU = use_GPU + self.dims = dims + self.premodel_path = premodel_path + if model is None: + block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims] + model = InceptionV3([block_idx], normalize_input=False) + if premodel_path is None: + premodel_path = get_weights_path_from_url(INCEPTIONV3_WEIGHT_URL) + self.model = model + param_dict = paddle.load(premodel_path) + self.model.load_dict(param_dict) + self.model.eval() + self.reset() + + def reset(self): + self.preds = [] + self.gts = [] + self.results = [] + + def update(self, preds, gts): + preds_inception, gts_inception = calculate_inception_val( + preds, gts, self.batch_size, self.model, self.use_GPU, self.dims) + self.preds.append(preds_inception) + self.gts.append(gts_inception) + + def accumulate(self): + self.preds = np.concatenate(self.preds, axis=0) + self.gts = np.concatenate(self.gts, axis=0) + value = calculate_fid_given_img(self.preds, self.gts) + self.reset() + return value + + def name(self): + return 'FID' + + +def _calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): + m1 = np.atleast_1d(mu1) + m2 = np.atleast_1d(mu2) + + sigma1 = np.atleast_2d(sigma1) + sigma2 = np.atleast_2d(sigma2) + + assert mu1.shape == mu2.shape, 'Training and test mean vectors have 
different lengths' + assert sigma1.shape == sigma2.shape, 'Training and test covariances have different dimensions' + + diff = mu1 - mu2 + + t = sigma1.dot(sigma2) + covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) + if not np.isfinite(covmean).all(): + msg = ('fid calculation produces singular product; ' + 'adding %s to diagonal of cov estimates') % eps + print(msg) + offset = np.eye(sigma1.shape[0]) * eps + covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) + + # Numerical error might give slight imaginary component + if np.iscomplexobj(covmean): + if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3): + m = np.max(np.abs(covmean.imag)) + raise ValueError('Imaginary component {}'.format(m)) + covmean = covmean.real + tr_covmean = np.trace(covmean) + + return (diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - + 2 * tr_covmean) + + +def _get_activations_from_ims(img, model, batch_size, dims, use_gpu): + n_batches = (len(img) + batch_size - 1) // batch_size + n_used_img = len(img) + + pred_arr = np.empty((n_used_img, dims)) + + for i in tqdm(range(n_batches)): + start = i * batch_size + end = start + batch_size + if end > len(img): + end = len(img) + images = img[start:end] + # if images.shape[1] != 3: + # images = images.transpose((0, 3, 1, 2)) + + # images = paddle.to_tensor(images) + pred = model(images)[0][0] + pred_arr[start:end] = pred.reshape([end - start, -1]).cpu().numpy() + return pred_arr + + +def _compute_statistic_of_img(act): + mu = np.mean(act, axis=0) + sigma = np.cov(act, rowvar=False) + return mu, sigma + +def calculate_inception_val(img_fake, + img_real, + batch_size, + model, + use_gpu = True, + dims = 2048): + act_fake = _get_activations_from_ims(img_fake, model, batch_size, dims, use_gpu) + act_real = _get_activations_from_ims(img_real, model, batch_size, dims, use_gpu) + return act_fake, act_real + +def calculate_fid_given_img(act_fake, act_real): + + m1, s1 = _compute_statistic_of_img(act_fake) + m2, s2 = _compute_statistic_of_img(act_real) + fid_value = _calculate_frechet_distance(m1, s1, m2, s2) + return fid_value + + +def _get_activations(files, + model, + batch_size, + dims, + use_gpu, + premodel_path, + style=None): + if len(files) % batch_size != 0: + print(('Warning: number of images is not a multiple of the ' + 'batch size. Some samples are going to be ignored.')) + if batch_size > len(files): + print(('Warning: batch size is bigger than the datasets size. 
' + 'Setting batch size to datasets size')) + batch_size = len(files) + + n_batches = len(files) // batch_size + n_used_imgs = n_batches * batch_size + + pred_arr = np.empty((n_used_imgs, dims)) + for i in tqdm(range(n_batches)): + start = i * batch_size + end = start + batch_size + + # same as stargan-v2 official implementation: resize to 256 first, then resize to 299 + if style == 'stargan': + img_list = [] + for f in files[start:end]: + im = Image.open(str(f)).convert('RGB') + if im.size[0] != 299: + im = im.resize((256, 256), 2) + im = im.resize((299, 299), 2) + + img_list.append(np.array(im).astype('float32')) + + images = np.array(img_list) + else: + images = np.array( + [imread(str(f)).astype(np.float32) for f in files[start:end]]) + + if len(images.shape) != 4: + images = imread(str(files[start])) + images = cv2.cvtColor(images, cv2.COLOR_BGR2GRAY) + images = np.array([images.astype(np.float32)]) + + images = images.transpose((0, 3, 1, 2)) + images /= 255 + + # imagenet normalization + if style == 'stargan': + mean = np.array([0.485, 0.456, 0.406]).astype('float32') + std = np.array([0.229, 0.224, 0.225]).astype('float32') + images[:] = (images[:] - mean[:, None, None]) / std[:, None, None] + + if style == 'stargan': + pred_arr[start:end] = inception_infer(images, premodel_path) + else: + with paddle.guard(): + images = paddle.to_tensor(images) + param_dict, _ = paddle.load(premodel_path) + model.set_dict(param_dict) + model.eval() + + pred = model(images)[0][0].numpy() + + pred_arr[start:end] = pred.reshape(end - start, -1) + + return pred_arr + + +def inception_infer(x, model_path): + exe = paddle.static.Executor() + [inference_program, feed_target_names, + fetch_targets] = paddle.static.load_inference_model(model_path, exe) + results = exe.run(inference_program, + feed={feed_target_names[0]: x}, + fetch_list=fetch_targets) + return results[0] + + +def _calculate_activation_statistics(files, + model, + premodel_path, + batch_size=50, + dims=2048, + use_gpu=False, + style=None): + act = _get_activations(files, model, batch_size, dims, use_gpu, + premodel_path, style) + mu = np.mean(act, axis=0) + sigma = np.cov(act, rowvar=False) + return mu, sigma + + +def _compute_statistics_of_path(path, + model, + batch_size, + dims, + use_gpu, + premodel_path, + style=None): + if path.endswith('.npz'): + f = np.load(path) + m, s = f['mu'][:], f['sigma'][:] + f.close() + else: + files = [] + for root, dirnames, filenames in os.walk(path): + for filename in fnmatch.filter( + filenames, '*.jpg') or fnmatch.filter(filenames, '*.png'): + files.append(os.path.join(root, filename)) + m, s = _calculate_activation_statistics(files, model, premodel_path, + batch_size, dims, use_gpu, + style) + return m, s + + +def calculate_fid_given_paths(paths, + premodel_path, + batch_size, + use_gpu, + dims, + model=None, + style=None): + assert os.path.exists( + premodel_path + ), 'pretrain_model path {} is not exists! 
Please download it first'.format( + premodel_path) + for p in paths: + if not os.path.exists(p): + raise RuntimeError('Invalid path: %s' % p) + + if model is None and style != 'stargan': + with paddle.guard(): + block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims] + model = InceptionV3([block_idx], class_dim=1008) + + m1, s1 = _compute_statistics_of_path(paths[0], model, batch_size, dims, + use_gpu, premodel_path, style) + m2, s2 = _compute_statistics_of_path(paths[1], model, batch_size, dims, + use_gpu, premodel_path, style) + + fid_value = _calculate_frechet_distance(m1, s1, m2, s2) + return fid_value + diff --git a/gan/transGAN/metrics/inception.py b/gan/transGAN/metrics/inception.py new file mode 100644 index 00000000..b98f2fc6 --- /dev/null +++ b/gan/transGAN/metrics/inception.py @@ -0,0 +1,747 @@ +#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import math +import paddle +import paddle.nn as nn +from paddle.nn import Conv2D, AvgPool2D, MaxPool2D, BatchNorm, Linear, AdaptiveAvgPool2D + +__all__ = ['InceptionV3'] + + +class InceptionV3(nn.Layer): + DEFAULT_BLOCK_INDEX = 3 + BLOCK_INDEX_BY_DIM = { + 64: 0, # First max pooling features + 192: 1, # Second max pooling featurs + 768: 2, # Pre-aux classifier features + 2048: 3 # Final average pooling features + } + + def __init__(self, + output_blocks=[DEFAULT_BLOCK_INDEX], + class_dim=1000, + aux_logits=False, + resize_input=True, + normalize_input=True): + super(InceptionV3, self).__init__() + self.resize_input = resize_input + self.normalize_input = normalize_input + self.output_blocks = sorted(output_blocks) + self.last_needed_block = max(output_blocks) + self.class_dim = class_dim + self.aux_logits = aux_logits + + assert self.last_needed_block <= 3, 'Last possible output block index is 3' + self.blocks = [] + + self.Conv2d_1a_3x3 = ConvBNLayer(3, + 32, + 3, + stride=2, + name='Conv2d_1a_3x3') + self.Conv2d_2a_3x3 = ConvBNLayer(32, 32, 3, name='Conv2d_2a_3x3') + self.Conv2d_2b_3x3 = ConvBNLayer(32, + 64, + 3, + padding=1, + name='Conv2d_2b_3x3') + self.maxpool1 = MaxPool2D(kernel_size=3, stride=2) + + block0 = [ + self.Conv2d_1a_3x3, self.Conv2d_2a_3x3, self.Conv2d_2b_3x3, + self.maxpool1 + ] + self.blocks.append(nn.Sequential(*block0)) + ### block1 + + if self.last_needed_block >= 1: + self.Conv2d_3b_1x1 = ConvBNLayer(64, 80, 1, name='Conv2d_3b_1x1') + self.Conv2d_4a_3x3 = ConvBNLayer(80, 192, 3, name='Conv2d_4a_3x3') + self.maxpool2 = MaxPool2D(kernel_size=3, stride=2) + block1 = [self.Conv2d_3b_1x1, self.Conv2d_4a_3x3, self.maxpool2] + self.blocks.append(nn.Sequential(*block1)) + + ### block2 + ### Mixed_5b 5c 5d + if self.last_needed_block >= 2: + self.Mixed_5b = Fid_inceptionA(192, + pool_features=32, + name='Mixed_5b') + self.Mixed_5c = Fid_inceptionA(256, + pool_features=64, + name='Mixed_5c') + self.Mixed_5d = Fid_inceptionA(288, + pool_features=64, + name='Mixed_5d') + + ### Mixed_6 + self.Mixed_6a = InceptionB(288, name='Mixed_6a') + self.Mixed_6b = Fid_inceptionC(768, c7=128, 
name='Mixed_6b') + self.Mixed_6c = Fid_inceptionC(768, c7=160, name='Mixed_6c') + self.Mixed_6d = Fid_inceptionC(768, c7=160, name='Mixed_6d') + self.Mixed_6e = Fid_inceptionC(768, c7=192, name='Mixed_6e') + + block2 = [ + self.Mixed_5b, self.Mixed_5c, self.Mixed_5d, self.Mixed_6a, + self.Mixed_6b, self.Mixed_6c, self.Mixed_6d, self.Mixed_6e + ] + self.blocks.append(nn.Sequential(*block2)) + + if self.aux_logits: + self.AuxLogits = InceptionAux(768, self.class_dim, name='AuxLogits') + ### block3 + ### Mixed_7 + if self.last_needed_block >= 3: + self.Mixed_7a = InceptionD(768, name='Mixed_7a') + self.Mixed_7b = Fid_inceptionE_1(1280, name='Mixed_7b') + self.Mixed_7c = Fid_inceptionE_2(2048, name='Mixed_7c') + self.avgpool = AdaptiveAvgPool2D(output_size=1) + + block3 = [self.Mixed_7a, self.Mixed_7b, self.Mixed_7c, self.avgpool] + self.blocks.append(nn.Sequential(*block3)) + + def forward(self, x): + out = [] + aux = None + if self.resize_input: + x = nn.functional.interpolate(x, + size=[299, 299], + mode='bilinear', + align_corners=False, + align_mode=0) + + if self.normalize_input: + x = x * 2 - 1 + + for idx, block in enumerate(self.blocks): + x = block(x) + if self.aux_logits and (idx == 2): + aux = self.AuxLogits(x) + if idx in self.output_blocks: + out.append(x) + if idx == self.last_needed_block: + break + + return out, aux + + +class InceptionA(nn.Layer): + def __init__(self, in_channels, pool_features, name=None): + super(InceptionA, self).__init__() + self.branch1x1 = ConvBNLayer(in_channels, + 64, + 1, + name=name + '.branch1x1') + + self.branch5x5_1 = ConvBNLayer(in_channels, + 48, + 1, + name=name + '.branch5x5_1') + self.branch5x5_2 = ConvBNLayer(48, + 64, + 5, + padding=2, + name=name + '.branch5x5_2') + + self.branch3x3dbl_1 = ConvBNLayer(in_channels, + 64, + 1, + name=name + '.branch3x3dbl_1') + self.branch3x3dbl_2 = ConvBNLayer(64, + 96, + 3, + padding=1, + name=name + '.branch3x3dbl_2') + self.branch3x3dbl_3 = ConvBNLayer(96, + 96, + 3, + padding=1, + name=name + '.branch3x3dbl_3') + + self.branch_pool0 = AvgPool2D(kernel_size=3, + stride=1, + padding=1, + exclusive=True) + self.branch_pool = ConvBNLayer(in_channels, + pool_features, + 1, + name=name + '.branch_pool') + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch5x5 = self.branch5x5_1(x) + branch5x5 = self.branch5x5_2(branch5x5) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = self.branch_pool0(x) + branch_pool = self.branch_pool(branch_pool) + return paddle.concat( + [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1) + + +class InceptionB(nn.Layer): + def __init__(self, in_channels, name=None): + super(InceptionB, self).__init__() + self.branch3x3 = ConvBNLayer(in_channels, + 384, + 3, + stride=2, + name=name + '.branch3x3') + + self.branch3x3dbl_1 = ConvBNLayer(in_channels, + 64, + 1, + name=name + '.branch3x3dbl_1') + self.branch3x3dbl_2 = ConvBNLayer(64, + 96, + 3, + padding=1, + name=name + '.branch3x3dbl_2') + self.branch3x3dbl_3 = ConvBNLayer(96, + 96, + 3, + stride=2, + name=name + '.branch3x3dbl_3') + + self.branch_pool = MaxPool2D(kernel_size=3, stride=2) + + def forward(self, x): + branch3x3 = self.branch3x3(x) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = self.branch_pool(x) + return paddle.concat([branch3x3, branch3x3dbl, branch_pool], + axis=1) + + 
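+# InceptionC factorizes the 7x7 convolutions into stacked 1x7 and 7x1
+# (asymmetric) convolutions with `c7` intermediate channels, following Inception v3.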
+class InceptionC(nn.Layer): + def __init__(self, in_channels, c7, name=None): + super(InceptionC, self).__init__() + self.branch1x1 = ConvBNLayer(in_channels, + 192, + 1, + name=name + '.branch1x1') + + self.branch7x7_1 = ConvBNLayer(in_channels, + c7, + 1, + name=name + '.branch7x7_1') + self.branch7x7_2 = ConvBNLayer(c7, + c7, (1, 7), + padding=(0, 3), + name=name + '.branch7x7_2') + self.branch7x7_3 = ConvBNLayer(c7, + 192, (7, 1), + padding=(3, 0), + name=name + '.branch7x7_3') + + self.branch7x7dbl_1 = ConvBNLayer(in_channels, + c7, + 1, + name=name + '.branch7x7dbl_1') + self.branch7x7dbl_2 = ConvBNLayer(c7, + c7, (7, 1), + padding=(3, 0), + name=name + '.branch7x7dbl_2') + self.branch7x7dbl_3 = ConvBNLayer(c7, + c7, (1, 7), + padding=(0, 3), + name=name + '.branch7x7dbl_3') + self.branch7x7dbl_4 = ConvBNLayer(c7, + c7, (7, 1), + padding=(3, 0), + name=name + '.branch7x7dbl_4') + self.branch7x7dbl_5 = ConvBNLayer(c7, + 192, (1, 7), + padding=(0, 3), + name=name + '.branch7x7dbl_5') + + self.branch_pool0 = AvgPool2D(kernel_size=3, + stride=1, + padding=1, + exclusive=True) + self.branch_pool = ConvBNLayer(in_channels, + 192, + 1, + name=name + '.branch_pool') + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch7x7 = self.branch7x7_1(x) + branch7x7 = self.branch7x7_2(branch7x7) + branch7x7 = self.branch7x7_3(branch7x7) + + branch7x7dbl = self.branch7x7dbl_1(x) + branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) + + branch_pool = self.branch_pool0(x) + branch_pool = self.branch_pool(branch_pool) + + return paddle.concat( + [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1) + + +class InceptionD(nn.Layer): + def __init__(self, in_channels, name=None): + super(InceptionD, self).__init__() + self.branch3x3_1 = ConvBNLayer(in_channels, + 192, + 1, + name=name + '.branch3x3_1') + self.branch3x3_2 = ConvBNLayer(192, + 320, + 3, + stride=2, + name=name + '.branch3x3_2') + + self.branch7x7x3_1 = ConvBNLayer(in_channels, + 192, + 1, + name=name + '.branch7x7x3_1') + self.branch7x7x3_2 = ConvBNLayer(192, + 192, (1, 7), + padding=(0, 3), + name=name + '.branch7x7x3_2') + self.branch7x7x3_3 = ConvBNLayer(192, + 192, (7, 1), + padding=(3, 0), + name=name + '.branch7x7x3_3') + self.branch7x7x3_4 = ConvBNLayer(192, + 192, + 3, + stride=2, + name=name + '.branch7x7x3_4') + + self.branch_pool = MaxPool2D(kernel_size=3, stride=2) + + def forward(self, x): + branch3x3 = self.branch3x3_1(x) + branch3x3 = self.branch3x3_2(branch3x3) + + branch7x7x3 = self.branch7x7x3_1(x) + branch7x7x3 = self.branch7x7x3_2(branch7x7x3) + branch7x7x3 = self.branch7x7x3_3(branch7x7x3) + branch7x7x3 = self.branch7x7x3_4(branch7x7x3) + + branch_pool = self.branch_pool(x) + + return paddle.concat([branch3x3, branch7x7x3, branch_pool], + axis=1) + + +class InceptionE(nn.Layer): + def __init__(self, in_channels, name=None): + super(InceptionE, self).__init__() + self.branch1x1 = ConvBNLayer(in_channels, + 320, + 1, + name=name + '.branch1x1') + + self.branch3x3_1 = ConvBNLayer(in_channels, + 384, + 1, + name=name + '.branch3x3_1') + self.branch3x3_2a = ConvBNLayer(384, + 384, (1, 3), + padding=(0, 1), + name=name + '.branch3x3_2a') + self.branch3x3_2b = ConvBNLayer(384, + 384, (3, 1), + padding=(1, 0), + name=name + '.branch3x3_2b') + + self.branch3x3dbl_1 = ConvBNLayer(in_channels, + 448, + 1, + name=name + '.branch3x3dbl_1') + self.branch3x3dbl_2 = 
ConvBNLayer(448, + 384, + 3, + padding=1, + name=name + '.branch3x3dbl_2') + self.branch3x3dbl_3a = ConvBNLayer(384, + 384, (1, 3), + padding=(0, 1), + name=name + '.branch3x3dbl_3a') + self.branch3x3dbl_3b = ConvBNLayer(384, + 384, (3, 1), + padding=(1, 0), + name=name + '.branch3x3dbl_3b') + + self.branch_pool0 = AvgPool2D(kernel_size=3, + stride=1, + padding=1, + exclusive=True) + self.branch_pool = ConvBNLayer(in_channels, + 192, + 1, + name=name + '.branch_pool') + + def forward(self, x): + branch1x1 = self.branch1x1(x) + branch3x3_1 = self.branch3x3_1(x) + branch3x3_2a = self.branch3x3_2a(branch3x3_1) + branch3x3_2b = self.branch3x3_2b(branch3x3_1) + branch3x3 = paddle.concat([branch3x3_2a, branch3x3_2b], axis=1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl_3a = self.branch3x3dbl_3a(branch3x3dbl) + branch3x3dbl_3b = self.branch3x3dbl_3b(branch3x3dbl) + branch3x3dbl = paddle.concat([branch3x3dbl_3a, branch3x3dbl_3b], + axis=1) + + branch_pool = self.branch_pool0(x) + branch_pool = self.branch_pool(branch_pool) + + return paddle.concat( + [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) + + +class InceptionAux(nn.Layer): + def __init__(self, in_channels, num_classes, name=None): + super(InceptionAux, self).__init__() + self.num_classes = num_classes + self.pool0 = AvgPool2D(kernel_size=5, stride=3) + self.conv0 = ConvBNLayer(in_channels, 128, 1, name=name + '.conv0') + self.conv1 = ConvBNLayer(128, 768, 5, name=name + '.conv1') + self.pool1 = AvgPool2D(global_pooling=True) + + def forward(self, x): + x = self.pool0(x) + x = self.conv0(x) + x = self.conv1(x) + x = self.pool1(x) + x = paddle.flatten(x, axis=1) + x = paddle.static.nn.fc(x, size=self.num_classes) + return x + + +class Fid_inceptionA(nn.Layer): + """ FID block in inception v3 + """ + def __init__(self, in_channels, pool_features, name=None): + super(Fid_inceptionA, self).__init__() + self.branch1x1 = ConvBNLayer(in_channels, + 64, + 1, + name=name + '.branch1x1') + + self.branch5x5_1 = ConvBNLayer(in_channels, + 48, + 1, + name=name + '.branch5x5_1') + self.branch5x5_2 = ConvBNLayer(48, + 64, + 5, + padding=2, + name=name + '.branch5x5_2') + + self.branch3x3dbl_1 = ConvBNLayer(in_channels, + 64, + 1, + name=name + '.branch3x3dbl_1') + self.branch3x3dbl_2 = ConvBNLayer(64, + 96, + 3, + padding=1, + name=name + '.branch3x3dbl_2') + self.branch3x3dbl_3 = ConvBNLayer(96, + 96, + 3, + padding=1, + name=name + '.branch3x3dbl_3') + + self.branch_pool0 = AvgPool2D(kernel_size=3, + stride=1, + padding=1, + exclusive=True) + self.branch_pool = ConvBNLayer(in_channels, + pool_features, + 1, + name=name + '.branch_pool') + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch5x5 = self.branch5x5_1(x) + branch5x5 = self.branch5x5_2(branch5x5) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = self.branch_pool0(x) + branch_pool = self.branch_pool(branch_pool) + return paddle.concat( + [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1) + + +class Fid_inceptionC(nn.Layer): + """ FID block in inception v3 + """ + def __init__(self, in_channels, c7, name=None): + super(Fid_inceptionC, self).__init__() + self.branch1x1 = ConvBNLayer(in_channels, + 192, + 1, + name=name + '.branch1x1') + + self.branch7x7_1 = ConvBNLayer(in_channels, + c7, + 1, + name=name + '.branch7x7_1') + self.branch7x7_2 = ConvBNLayer(c7, + c7, (1, 7), + padding=(0, 3), + 
name=name + '.branch7x7_2') + self.branch7x7_3 = ConvBNLayer(c7, + 192, (7, 1), + padding=(3, 0), + name=name + '.branch7x7_3') + + self.branch7x7dbl_1 = ConvBNLayer(in_channels, + c7, + 1, + name=name + '.branch7x7dbl_1') + self.branch7x7dbl_2 = ConvBNLayer(c7, + c7, (7, 1), + padding=(3, 0), + name=name + '.branch7x7dbl_2') + self.branch7x7dbl_3 = ConvBNLayer(c7, + c7, (1, 7), + padding=(0, 3), + name=name + '.branch7x7dbl_3') + self.branch7x7dbl_4 = ConvBNLayer(c7, + c7, (7, 1), + padding=(3, 0), + name=name + '.branch7x7dbl_4') + self.branch7x7dbl_5 = ConvBNLayer(c7, + 192, (1, 7), + padding=(0, 3), + name=name + '.branch7x7dbl_5') + + self.branch_pool0 = AvgPool2D(kernel_size=3, + stride=1, + padding=1, + exclusive=True) + self.branch_pool = ConvBNLayer(in_channels, + 192, + 1, + name=name + '.branch_pool') + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch7x7 = self.branch7x7_1(x) + branch7x7 = self.branch7x7_2(branch7x7) + branch7x7 = self.branch7x7_3(branch7x7) + + branch7x7dbl = self.branch7x7dbl_1(x) + branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) + + branch_pool = self.branch_pool0(x) + branch_pool = self.branch_pool(branch_pool) + + return paddle.concat( + [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1) + + +class Fid_inceptionE_1(nn.Layer): + """ FID block in inception v3 + """ + def __init__(self, in_channels, name=None): + super(Fid_inceptionE_1, self).__init__() + self.branch1x1 = ConvBNLayer(in_channels, + 320, + 1, + name=name + '.branch1x1') + + self.branch3x3_1 = ConvBNLayer(in_channels, + 384, + 1, + name=name + '.branch3x3_1') + self.branch3x3_2a = ConvBNLayer(384, + 384, (1, 3), + padding=(0, 1), + name=name + '.branch3x3_2a') + self.branch3x3_2b = ConvBNLayer(384, + 384, (3, 1), + padding=(1, 0), + name=name + '.branch3x3_2b') + + self.branch3x3dbl_1 = ConvBNLayer(in_channels, + 448, + 1, + name=name + '.branch3x3dbl_1') + self.branch3x3dbl_2 = ConvBNLayer(448, + 384, + 3, + padding=1, + name=name + '.branch3x3dbl_2') + self.branch3x3dbl_3a = ConvBNLayer(384, + 384, (1, 3), + padding=(0, 1), + name=name + '.branch3x3dbl_3a') + self.branch3x3dbl_3b = ConvBNLayer(384, + 384, (3, 1), + padding=(1, 0), + name=name + '.branch3x3dbl_3b') + + self.branch_pool0 = AvgPool2D(kernel_size=3, + stride=1, + padding=1, + exclusive=True) + self.branch_pool = ConvBNLayer(in_channels, + 192, + 1, + name=name + '.branch_pool') + + def forward(self, x): + branch1x1 = self.branch1x1(x) + branch3x3_1 = self.branch3x3_1(x) + branch3x3_2a = self.branch3x3_2a(branch3x3_1) + branch3x3_2b = self.branch3x3_2b(branch3x3_1) + branch3x3 = paddle.concat([branch3x3_2a, branch3x3_2b], axis=1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl_3a = self.branch3x3dbl_3a(branch3x3dbl) + branch3x3dbl_3b = self.branch3x3dbl_3b(branch3x3dbl) + branch3x3dbl = paddle.concat([branch3x3dbl_3a, branch3x3dbl_3b], + axis=1) + + branch_pool = self.branch_pool0(x) + branch_pool = self.branch_pool(branch_pool) + + return paddle.concat( + [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) + + +class Fid_inceptionE_2(nn.Layer): + """ FID block in inception v3 + """ + def __init__(self, in_channels, name=None): + super(Fid_inceptionE_2, self).__init__() + self.branch1x1 = ConvBNLayer(in_channels, + 320, + 1, + name=name + '.branch1x1') + + self.branch3x3_1 = 
ConvBNLayer(in_channels, + 384, + 1, + name=name + '.branch3x3_1') + self.branch3x3_2a = ConvBNLayer(384, + 384, (1, 3), + padding=(0, 1), + name=name + '.branch3x3_2a') + self.branch3x3_2b = ConvBNLayer(384, + 384, (3, 1), + padding=(1, 0), + name=name + '.branch3x3_2b') + + self.branch3x3dbl_1 = ConvBNLayer(in_channels, + 448, + 1, + name=name + '.branch3x3dbl_1') + self.branch3x3dbl_2 = ConvBNLayer(448, + 384, + 3, + padding=1, + name=name + '.branch3x3dbl_2') + self.branch3x3dbl_3a = ConvBNLayer(384, + 384, (1, 3), + padding=(0, 1), + name=name + '.branch3x3dbl_3a') + self.branch3x3dbl_3b = ConvBNLayer(384, + 384, (3, 1), + padding=(1, 0), + name=name + '.branch3x3dbl_3b') + ### same with paper + self.branch_pool0 = MaxPool2D(kernel_size=3, + stride=1, + padding=1) + self.branch_pool = ConvBNLayer(in_channels, + 192, + 1, + name=name + '.branch_pool') + + def forward(self, x): + branch1x1 = self.branch1x1(x) + branch3x3_1 = self.branch3x3_1(x) + branch3x3_2a = self.branch3x3_2a(branch3x3_1) + branch3x3_2b = self.branch3x3_2b(branch3x3_1) + branch3x3 = paddle.concat([branch3x3_2a, branch3x3_2b], axis=1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl_3a = self.branch3x3dbl_3a(branch3x3dbl) + branch3x3dbl_3b = self.branch3x3dbl_3b(branch3x3dbl) + branch3x3dbl = paddle.concat([branch3x3dbl_3a, branch3x3dbl_3b], + axis=1) + + branch_pool = self.branch_pool0(x) + branch_pool = self.branch_pool(branch_pool) + + return paddle.concat( + [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + num_filters, + filter_size, + stride=1, + padding=0, + groups=1, + act='relu', + name=None): + super(ConvBNLayer, self).__init__() + self.conv = Conv2D(in_channels=in_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=paddle.ParamAttr(name=name + ".conv.weight"), + bias_attr=False) + self.bn = BatchNorm(num_filters, + act=act, + epsilon=0.001, + param_attr=paddle.ParamAttr(name=name + ".bn.weight"), + bias_attr=paddle.ParamAttr(name=name + ".bn.bias"), + moving_mean_name=name + '.bn.running_mean', + moving_variance_name=name + '.bn.running_var') + + def forward(self, inputs): + y = self.conv(inputs) + y = self.bn(y) + return y diff --git a/gan/transGAN/metrics/psnr_ssim.py b/gan/transGAN/metrics/psnr_ssim.py new file mode 100644 index 00000000..72702de0 --- /dev/null +++ b/gan/transGAN/metrics/psnr_ssim.py @@ -0,0 +1,334 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
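+"""PSNR and SSIM image quality metrics, plus the color-space helpers they use."""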
+ +import cv2 +import numpy as np +import paddle + +from .builder import METRICS + + +@METRICS.register() +class PSNR(paddle.metric.Metric): + def __init__(self, crop_border, input_order='HWC', test_y_channel=False): + self.crop_border = crop_border + self.input_order = input_order + self.test_y_channel = test_y_channel + self.reset() + + def reset(self): + self.results = [] + + def update(self, preds, gts): + if not isinstance(preds, (list, tuple)): + preds = [preds] + + if not isinstance(gts, (list, tuple)): + gts = [gts] + + for pred, gt in zip(preds, gts): + value = calculate_psnr(pred, gt, self.crop_border, self.input_order, + self.test_y_channel) + self.results.append(value) + + def accumulate(self): + if paddle.distributed.get_world_size() > 1: + results = paddle.to_tensor(self.results) + results_list = [] + paddle.distributed.all_gather(results_list, results) + self.results = paddle.concat(results_list).numpy() + + if len(self.results) <= 0: + return 0. + return np.mean(self.results) + + def name(self): + return 'PSNR' + + +@METRICS.register() +class SSIM(PSNR): + def update(self, preds, gts): + if not isinstance(preds, (list, tuple)): + preds = [preds] + + if not isinstance(gts, (list, tuple)): + gts = [gts] + + for pred, gt in zip(preds, gts): + value = calculate_ssim(pred, gt, self.crop_border, self.input_order, + self.test_y_channel) + self.results.append(value) + + def name(self): + return 'SSIM' + + +def calculate_psnr(img1, + img2, + crop_border, + input_order='HWC', + test_y_channel=False): + """Calculate PSNR (Peak Signal-to-Noise Ratio). + + Ref: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio + + Args: + img1 (ndarray): Images with range [0, 255]. + img2 (ndarray): Images with range [0, 255]. + crop_border (int): Cropped pixels in each edge of an image. These + pixels are not involved in the PSNR calculation. + input_order (str): Whether the input order is 'HWC' or 'CHW'. + Default: 'HWC'. + test_y_channel (bool): Test on Y channel of YCbCr. Default: False. + + Returns: + float: psnr result. + """ + + assert img1.shape == img2.shape, ( + f'Image shapes are differnet: {img1.shape}, {img2.shape}.') + if input_order not in ['HWC', 'CHW']: + raise ValueError( + f'Wrong input_order {input_order}. Supported input_orders are ' + '"HWC" and "CHW"') + img1 = img1.copy().astype('float32') + img2 = img2.copy().astype('float32') + img1 = reorder_image(img1, input_order=input_order) + img2 = reorder_image(img2, input_order=input_order) + + if crop_border != 0: + img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...] + img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] + + if test_y_channel: + img1 = to_y_channel(img1) + img2 = to_y_channel(img2) + + mse = np.mean((img1 - img2)**2) + if mse == 0: + return float('inf') + return 20. * np.log10(255. / np.sqrt(mse)) + + +def _ssim(img1, img2): + """Calculate SSIM (structural similarity) for one channel images. + + It is called by func:`calculate_ssim`. + + Args: + img1 (ndarray): Images with range [0, 255] with order 'HWC'. + img2 (ndarray): Images with range [0, 255] with order 'HWC'. + + Returns: + float: ssim result. 
+ """ + + C1 = (0.01 * 255)**2 + C2 = (0.03 * 255)**2 + + img1 = img1.astype(np.float64) + img2 = img2.astype(np.float64) + kernel = cv2.getGaussianKernel(11, 1.5) + window = np.outer(kernel, kernel.transpose()) + + mu1 = cv2.filter2D(img1, -1, window)[5:-5, 5:-5] + mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5] + mu1_sq = mu1**2 + mu2_sq = mu2**2 + mu1_mu2 = mu1 * mu2 + sigma1_sq = cv2.filter2D(img1**2, -1, window)[5:-5, 5:-5] - mu1_sq + sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq + sigma12 = cv2.filter2D(img1 * img2, -1, window)[5:-5, 5:-5] - mu1_mu2 + + ssim_map = ((2 * mu1_mu2 + C1) * + (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * + (sigma1_sq + sigma2_sq + C2)) + return ssim_map.mean() + + +def calculate_ssim(img1, + img2, + crop_border, + input_order='HWC', + test_y_channel=False): + """Calculate SSIM (structural similarity). + + Ref: + Image quality assessment: From error visibility to structural similarity + + The results are the same as that of the official released MATLAB code in + https://ece.uwaterloo.ca/~z70wang/research/ssim/. + + For three-channel images, SSIM is calculated for each channel and then + averaged. + + Args: + img1 (ndarray): Images with range [0, 255]. + img2 (ndarray): Images with range [0, 255]. + crop_border (int): Cropped pixels in each edge of an image. These + pixels are not involved in the SSIM calculation. + input_order (str): Whether the input order is 'HWC' or 'CHW'. + Default: 'HWC'. + test_y_channel (bool): Test on Y channel of YCbCr. Default: False. + + Returns: + float: ssim result. + """ + + assert img1.shape == img2.shape, ( + f'Image shapes are differnet: {img1.shape}, {img2.shape}.') + if input_order not in ['HWC', 'CHW']: + raise ValueError( + f'Wrong input_order {input_order}. Supported input_orders are ' + '"HWC" and "CHW"') + + img1 = img1.copy().astype('float32')[..., ::-1] + img2 = img2.copy().astype('float32')[..., ::-1] + + img1 = reorder_image(img1, input_order=input_order) + img2 = reorder_image(img2, input_order=input_order) + + if crop_border != 0: + img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...] + img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] + + if test_y_channel: + img1 = to_y_channel(img1) + img2 = to_y_channel(img2) + + ssims = [] + for i in range(img1.shape[2]): + ssims.append(_ssim(img1[..., i], img2[..., i])) + return np.array(ssims).mean() + + +def reorder_image(img, input_order='HWC'): + """Reorder images to 'HWC' order. + + If the input_order is (h, w), return (h, w, 1); + If the input_order is (c, h, w), return (h, w, c); + If the input_order is (h, w, c), return as it is. + + Args: + img (ndarray): Input image. + input_order (str): Whether the input order is 'HWC' or 'CHW'. + If the input image shape is (h, w), input_order will not have + effects. Default: 'HWC'. + + Returns: + ndarray: reordered image. + """ + + if input_order not in ['HWC', 'CHW']: + raise ValueError( + f'Wrong input_order {input_order}. Supported input_orders are ' + "'HWC' and 'CHW'") + if len(img.shape) == 2: + img = img[..., None] + return img + if input_order == 'CHW': + img = img.transpose(1, 2, 0) + return img + + +def bgr2ycbcr(img, y_only=False): + """Convert a BGR image to YCbCr image. + + The bgr version of rgb2ycbcr. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `BGR <-> YCrCb`. 
+ In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + y_only (bool): Whether to only return Y channel. Default: False. + + Returns: + ndarray: The converted YCbCr image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + + if y_only: + out_img = np.dot(img, [24.966, 128.553, 65.481]) + 16.0 + else: + out_img = np.matmul( + img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786], + [65.481, -37.797, 112.0]]) + [16, 128, 128] + return out_img + + +def rgb2ycbcr(img, y_only=False): + """Convert a RGB image to YCbCr image. + + The RGB version of rgb2ycbcr. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `RGB <-> YCrCb`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + y_only (bool): Whether to only return Y channel. Default: False. + + Returns: + ndarray: The converted YCbCr image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + + if img_type != np.uint8: + img *= 255. + + if y_only: + out_img = np.dot(img, [65.481, 128.553, 24.966]) / 255. + 16.0 + else: + out_img = np.matmul( + img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786], + [65.481, -37.797, 112.0]]) + [16, 128, 128] + + if img_type != np.uint8: + out_img /= 255. + else: + out_img = out_img.round() + + return out_img + + +def to_y_channel(img): + """Change to Y channel of YCbCr. + + Args: + img (ndarray): Images with range [0, 255]. + + Returns: + (ndarray): Images with range [0, 255] (float type) without round. + """ + img = img.astype(np.float32) / 255. + if img.ndim == 3 and img.shape[2] == 3: + img = rgb2ycbcr(img, y_only=True) + img = img[..., None] + return img * 255. diff --git a/gan/transGAN/models/ViT_custom.py b/gan/transGAN/models/ViT_custom.py new file mode 100644 index 00000000..2730987e --- /dev/null +++ b/gan/transGAN/models/ViT_custom.py @@ -0,0 +1,432 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Implement transGAN_custom +""" + +import paddle +import paddle.nn as nn +from utils import trunc_normal_ +from utils import gelu +from utils import pixel_upsample +from utils import drop_path + +class Identity(nn.Layer): + """ Identity layer + The output of this layer is the input without any change. 
+ Use this layer to avoid if condition in some forward methods + """ + def __init__(self): + super(Identity, self).__init__() + def forward(self, x): + return x + +class matmul(nn.Layer): + """ matmul layer + + Matrix-vector multiplication, like np.dot(x1, x2) + + """ + def __init__(self): + super().__init__() + + def forward(self, x1, x2): + x = x1@x2 + return x + +class PixelNorm(nn.Layer): + """ PixelNorm layer + + Pixel level norm + + """ + def __init__(self, dim): + super().__init__() + + def forward(self, input): + return input * paddle.rsqrt(paddle.mean(input ** 2, dim=2, keepdim=True) + 1e-8) + +class CustomNorm(nn.Layer): + """ CustomNorm layer + + Custom norm method set, defalut "ln" + + """ + def __init__(self, norm_layer, dim): + super().__init__() + self.norm_type = norm_layer + if norm_layer == "ln": + self.norm = nn.LayerNorm(dim) + elif norm_layer == "bn": + self.norm = nn.BatchNorm1D(dim) + elif norm_layer == "in": + self.norm = nn.InstanceNorm1D(dim) + elif norm_layer == "pn": + self.norm = PixelNorm(dim) + + def forward(self, x): + if self.norm_type == "bn" or self.norm_type == "in": + x = self.norm(x.transpose((0, 2, 1))).transpose((0, 2, 1)) + return x + elif self.norm_type == "none": + return x + else: + return self.norm(x) + +class CustomAct(nn.Layer): + """ CustomAct layer + + Custom act method set, defalut "gelu" + + """ + def __init__(self, act_layer): + super().__init__() + if act_layer == "gelu": + self.act_layer = gelu + elif act_layer == "leakyrelu": + self.act_layer = leakyrelu + else: + self.act_layer = gelu + + def forward(self, x): + return self.act_layer(x) + +class Mlp(nn.Layer): + """ mlp layer + + Impl using nn.Linear and activation is GELU, dropout is applied. + Ops: fc -> act -> dropout -> fc -> dropout + Attributes: + fc1: nn.Linear + fc2: nn.Linear + act: GELU + dropout1: dropout after fc1 + dropout2: dropout after fc2 + + """ + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=gelu, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = CustomAct(act_layer) + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super().__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + +class Attention(nn.Layer): + """ attention layer + + Attention module for ViT, here q, k, v are assumed the same. + The qkv mappings are stored as one single param. 
+ Attributes: + num_heads: number of heads + qkv_bias: a nn.Linear for q, k, v mapping + qk_scale: 1 / sqrt(single_head_feature_dim) + attn_drop: dropout for attention + proj_drop: final dropout before output + softmax: softmax op for attention + window_size: window_size + + """ + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + window_size=16): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + + self.scale = qk_scale or head_dim ** -0.5 + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.mat = matmul() + self.window_size = window_size + if self.window_size != 0: + zeros_ = nn.initializer.Constant(value=0.) + # 2*Wh-1 * 2*Ww-1, nH + self.relative_position_bias_table = self.create_parameter( + shape=((2 * window_size - 1) * (2 * window_size - 1), num_heads), + default_initializer=zeros_ + ) + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size) + coords_w = paddle.arange(window_size) + coords = paddle.stack(paddle.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + # 2, Wh*Ww, Wh*Ww + relative_coords = coords_flatten.unsqueeze(2) - coords_flatten.unsqueeze(1) + relative_coords = relative_coords.transpose([1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size - 1 + relative_coords[:, :, 0] *= 2 * window_size - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + trunc_normal_(self.relative_position_bias_table, std=.02) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape([B, N, 3, self.num_heads, C // self.num_heads]) + qkv = qkv.transpose([2, 0, 3, 1, 4]) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + attn = (self.mat(q, k.transpose([0, 1, 3, 2]))) * self.scale + if self.window_size != 0: + relative_position_bias = paddle.index_select( + self.relative_position_bias_table, + self.relative_position_index.flatten().clone()) + relative_position_bias = relative_position_bias.reshape(( + self.window_size * self.window_size, + self.window_size * self.window_size, + -1)) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose((2, 0, 1)) # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + attn = paddle.nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + x = self.mat(attn, v).transpose([0, 2, 1, 3]).reshape([B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + +class Block(nn.Layer): + """ block layer + Make up the basic unit of the network + + """ + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=gelu, + norm_layer=nn.LayerNorm, + window_size=16): + super().__init__() + self.norm1 = CustomNorm(norm_layer, dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + window_size=window_size) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + self.norm2 = CustomNorm(norm_layer, dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + +class StageBlock(nn.Layer): + """ stageblock layer + Organize Block + + """ + def __init__(self, + depth, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=gelu, + norm_layer=nn.LayerNorm, + window_size=16): + super().__init__() + self.depth = depth + self.block = nn.LayerList([ + Block( + dim=dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path, + act_layer=act_layer, + norm_layer=norm_layer, + window_size=window_size + ) for i in range(depth)]) + + def forward(self, x): + for blk in self.block: + x = blk(x) + return x + + + +class Generator(nn.Layer): + """ generator layer + + Generator module for transGAN + Attributes: + args: args + embed_dim: the dim of embedding dim + depth: the block's depth + num_heads: number of MLP heads + mlp_ratio: decide the mlp_hidden_dim, defalut 4 + qkv_bias: a nn.Linear for q, k, v mapping + qk_scale: 1 / sqrt(single_head_feature_dim) + drop_rate: the dropout before output + attn_drop_rate: dropout for attention + drop_path_rate: the dropout before output + hybrid_backbone: if there some hybrid_backbone + norm_layer: which norm method + + """ + def __init__(self, + args, + embed_dim=384, + depth=5, + num_heads=4, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + norm_layer="ln"): + super().__init__() + self.args = args + self.ch = embed_dim + self.bottom_width = args.MODEL.BOTTOM_WIDTH + self.embed_dim = embed_dim = args.MODEL.GF_DIM + norm_layer = args.MODEL.G_NORM + mlp_ratio = args.MODEL.G_MLP + depth = [int(i) for i in args.MODEL.G_DEPTH.split(",")] + act_layer = args.MODEL.G_ACT + + zeros_ = nn.initializer.Constant(value=0.) 
+ self.l1 = nn.Linear(args.MODEL.LATENT_DIM, (self.bottom_width ** 2) * self.embed_dim) + self.pos_embed_1 = self.create_parameter( + shape=(1, self.bottom_width**2, embed_dim), default_initializer=zeros_) + self.pos_embed_2 = self.create_parameter( + shape=(1, (self.bottom_width*2)**2, embed_dim//4), default_initializer=zeros_) + self.pos_embed_3 = self.create_parameter( + shape=(1, (self.bottom_width*4)**2, embed_dim//16), default_initializer=zeros_) + self.pos_embed = [ + self.pos_embed_1, + self.pos_embed_2, + self.pos_embed_3 + ] + self.blocks = StageBlock( + depth=depth[0], + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=0, + act_layer=act_layer, + norm_layer=norm_layer, + window_size=8) + self.upsample_blocks = nn.LayerList([ + StageBlock( + depth=depth[1], + dim=embed_dim//4, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=0, + act_layer=act_layer, + norm_layer=norm_layer + ), + StageBlock( + depth=depth[2], + dim=embed_dim//16, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=0, + act_layer=act_layer, + norm_layer=norm_layer, + window_size=32 + ) + ]) + for i in range(len(self.pos_embed)): + trunc_normal_(self.pos_embed[i], std=.02) + self.deconv = nn.Sequential( + nn.Conv2D(self.embed_dim//16, 3, 1, 1, 0) + ) + + def set_arch(self, x, cur_stage): + pass + + def forward(self, z, epoch): + if self.args.LATENT_NORM: + latent_size = z.shape[-1] + z = (z/z.norm(axis=-1, keepdim=True) * (latent_size ** 0.5)) + + x = self.l1(z).reshape((-1, self.bottom_width ** 2, self.embed_dim)) + x = x + self.pos_embed[0] + H, W = self.bottom_width, self.bottom_width + x = self.blocks(x) + for index, blk in enumerate(self.upsample_blocks): + x, H, W = pixel_upsample(x, H, W) + x = x + self.pos_embed[index+1] + x = blk(x) + output = self.deconv(x.transpose((0, 2, 1)).reshape((-1, self.embed_dim//16, H, W))) + return output diff --git a/gan/transGAN/models/ViT_custom_scale2.py b/gan/transGAN/models/ViT_custom_scale2.py new file mode 100644 index 00000000..206009d3 --- /dev/null +++ b/gan/transGAN/models/ViT_custom_scale2.py @@ -0,0 +1,315 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Implement ViT_custom_scale2 +""" + +import numpy as np +import paddle +import paddle.nn as nn +from utils import trunc_normal_ +from utils import gelu +from utils import pixel_upsample +from utils import drop_path +from utils import DiffAugment +from utils import leakyrelu +from utils import normal_ +from utils import uniform_ +from utils import constant_ +from models.ViT_custom import Identity +from models.ViT_custom import matmul +from models.ViT_custom import PixelNorm +from models.ViT_custom import CustomNorm +from models.ViT_custom import Mlp +from models.ViT_custom import CustomAct +from models.ViT_custom import DropPath + +class Attention(nn.Layer): + """ attention layer + + Attention module for ViT, here q, k, v are assumed the same. + The qkv mappings are stored as one single param. + Attributes: + dim: defalut embedding dim, set in config + num_heads: number of heads + qkv_bias: a nn.Linear for q, k, v mapping + qk_scale: 1 / sqrt(single_head_feature_dim) + attn_drop: dropout for attention + proj_drop: final dropout before output + softmax: softmax op for attention + window_size: attention size + + """ + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + window_size=16): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.mat = matmul() + self.window_size = window_size + zeros_ = nn.initializer.Constant(value=0.) + self.noise_strength_1 = self.create_parameter(shape=[1], default_initializer=zeros_) + + if self.window_size != 0: + zeros_ = nn.initializer.Constant(value=0.) 
+ self.relative_position_bias_table = self.create_parameter( + shape=((2 * window_size - 1) * (2 * window_size - 1), + num_heads), default_initializer=zeros_) + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size) + coords_w = paddle.arange(window_size) + coords = paddle.stack(paddle.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + # 2, Wh*Ww, Wh*Ww + relative_coords = coords_flatten.unsqueeze(2) - coords_flatten.unsqueeze(1) + relative_coords = relative_coords.transpose([1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size - 1 + relative_coords[:, :, 0] *= 2 * window_size - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + trunc_normal_(self.relative_position_bias_table, std=.02) + + def forward(self, x): + B, N, C = x.shape + x = x + paddle.randn([x.shape[0], x.shape[1], 1]) * self.noise_strength_1 + qkv = self.qkv(x).reshape([B, N, 3, self.num_heads, C // self.num_heads]) + qkv = qkv.transpose([2, 0, 3, 1, 4]) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + attn = (self.mat(q, k.transpose([0, 1, 3, 2]))) * self.scale + if self.window_size != 0: + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.flatten().clone()] + relative_position_bias = relative_position_bias.reshape(( + self.window_size * self.window_size, + self.window_size * self.window_size, + -1)) + # nH, Wh*Ww, Wh*Ww + relative_position_bias = relative_position_bias.transpose((2, 0, 1)) + attn = attn + relative_position_bias.unsqueeze(0) + attn = paddle.nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + x = self.mat(attn, v).transpose([0, 2, 1, 3]).reshape([B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + +class DisBlock(nn.Layer): + """ block layer + Make up the basic unit of the network + + """ + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=leakyrelu, + norm_layer=nn.LayerNorm, + window_size=16): + super().__init__() + self.norm1 = CustomNorm(norm_layer, dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, + qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, + window_size=window_size) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + self.norm2 = CustomNorm(norm_layer, dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + self.gain = np.sqrt(0.5) if norm_layer == "none" else 1 + + def forward(self, x): + x = x*self.gain + self.drop_path(self.attn(self.norm1(x)))*self.gain + x = x*self.gain + self.drop_path(self.mlp(self.norm2(x)))*self.gain + return x + + +class Discriminator(nn.Layer): + """ Discriminator layer + + Discriminator module for transGAN + Attributes: + args: the input args + img_size: the size of img + patch_size: the patch size of the attention + num_classes: the num of class, There are actually only two + embed_dim: the dim of embedding dim + depth: the block depth + num_heads: number of heads + mlp_ratio: decide the mlp_hidden_dim, defalut 4 + qkv_bias: a nn.Linear for q, k, v mapping + qk_scale: 1 / sqrt(single_head_feature_dim) + drop_rate: the dropout before output + attn_drop_rate: dropout for attention + drop_path_rate: the dropout before output + norm_layer: which norm method + + """ + def __init__(self, + args, + patch_size=None, + num_classes=1, + embed_dim=None, + depth=7, + num_heads=4, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm): + super().__init__() + self.num_classes = num_classes + self.num_features = embed_dim = self.embed_dim = args.MODEL.DF_DIM + depth = args.MODEL.D_DEPTH + self.args = args + self.patch_size = patch_size = args.MODEL.PATCH_SIZE + norm_layer = args.MODEL.D_NORM + self.window_size = args.MODEL.D_WINDOW_SIZE + act_layer = args.MODEL.D_ACT + self.fRGB_1 = nn.Conv2D(3, + embed_dim//4*3, + kernel_size=patch_size, + stride=patch_size, padding=0) + self.fRGB_2 = nn.Conv2D(3, + embed_dim//4, + kernel_size=patch_size*2, + stride=patch_size*2, + padding=0) + + num_patches_1 = (args.DATA.IMAGE_SIZE // patch_size)**2 + num_patches_2 = ((args.DATA.IMAGE_SIZE//2) // patch_size)**2 + + zeros_ = nn.initializer.Constant(value=0.) 
+ self.cls_token = self.create_parameter( + shape=(1, 1, embed_dim), default_initializer=zeros_) + self.pos_embed_1 = self.create_parameter( + shape=(1, int(num_patches_1), embed_dim//4*3), default_initializer=zeros_) + self.pos_embed_2 = self.create_parameter( + shape=(1, int(num_patches_2), embed_dim), default_initializer=zeros_) + + self.pos_drop = nn.Dropout(p=drop_rate) + # stochastic depth decay rule + dpr = [x.item() for x in paddle.linspace(0, drop_path_rate, depth)] + self.blocks_1 = nn.LayerList([ + DisBlock(dim=embed_dim//4*3, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=0, + act_layer=act_layer, + norm_layer=norm_layer, + window_size=args.MODEL.BOTTOM_WIDTH*4//2) for i in range(depth)]) + self.blocks_2 = nn.LayerList([ + DisBlock(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=0, + act_layer=act_layer, + norm_layer=norm_layer, + window_size=args.MODEL.BOTTOM_WIDTH*4//4) for i in range(depth)]) + + self.last_block = nn.Sequential( + DisBlock(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[0], + act_layer=act_layer, + norm_layer=norm_layer, + window_size=0) + ) + + self.norm = CustomNorm(norm_layer, embed_dim) + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else Identity() + + trunc_normal_(self.pos_embed_1, std=.02) + trunc_normal_(self.pos_embed_2, std=.02) + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + self.Hz_fbank = None + self.Hz_geom = None + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + constant_(m.bias, 0) + constant_(m.weight, 1.0) + + def forward_features(self, x, aug=True, epoch=0): + if "None" not in self.args.DATA.DIFF_AUG and aug: + x = DiffAugment(x, self.args.DATA.DIFF_AUG, True, [self.Hz_geom, self.Hz_fbank]) + B, _, H, W = x.shape + H = W = H//self.patch_size + x_1 = self.fRGB_1(x).flatten(2).transpose([0, 2, 1]) + x_2 = self.fRGB_2(x).flatten(2).transpose([0, 2, 1]) + B = x.shape[0] + x = x_1 + self.pos_embed_1 + B, _, C = x.shape + for blk in self.blocks_1: + x = blk(x) + _, _, C = x.shape + x = x.transpose([0, 2, 1]).reshape([B, C, H, W]) + x = nn.AvgPool2D(2)(x) + _, _, H, W = x.shape + x = x.flatten(2).transpose([0, 2, 1]) + x = paddle.concat([x, x_2], axis=-1) + x = x + self.pos_embed_2 + for blk in self.blocks_2: + x = blk(x) + cls_tokens = self.cls_token.expand([B, -1, -1]) + x = paddle.concat((cls_tokens, x), axis=1) + x = self.last_block(x) + x = self.norm(x) + return x[:, 0] + + def forward(self, x, aug=True, epoch=0): + x = self.forward_features(x, aug=aug, epoch=epoch) + x = self.head(x) + return x diff --git a/gan/transGAN/readme.md b/gan/transGAN/readme.md new file mode 100644 index 00000000..baecde8b --- /dev/null +++ b/gan/transGAN/readme.md @@ -0,0 +1,164 @@ +# TransGAN: Two Pure Transformers Can Make One Strong GAN, and That Can Scale Up, [arxiv](https://arxiv.org/abs/2102.07074) + +PaddlePaddle training/validation code and pretrained models for **TransGAN**. + +The official pytorch implementation is [here](https://github.com/VITA-Group/TransGAN). 
+ +This implementation is developed by [PaddleViT](https://github.com/BR-IDL/PaddleViT.git). + + +

+*(figure: TransGAN Model Overview)*
+
+ + + + + + + +## Models Zoo +| Model | FID | Image Size | Link | +|--------------------------------|-----|------------|--------------| +| transgan_cifar10 |9.31 |32 |[google](https://drive.google.com/file/d/10NXjIUAkBmhPNiqTCYJ4hg3SWMw9BxCM/view?usp=sharing)/[baidu](https://pan.baidu.com/s/16hi_kUZZOijNJNxocTiJXQ)(9vle) | + +## Notebooks +We provide a few notebooks in aistudio to help you get started: + +**\*(coming soon)\*** + + +## Requirements +- Python>=3.6 +- yaml>=0.2.5 +- [PaddlePaddle](https://www.paddlepaddle.org.cn/documentation/docs/en/install/index_en.html)>=2.1.0 +- [yacs](https://github.com/rbgirshick/yacs)>=0.1.8 + +## Usage +To use the model with pretrained weights, download the `.pdparam` weight file and change related file paths in the following python scripts. The model config files are located in `./configs/`. + +For example, assume the downloaded weight file is stored in `./transgan_cifar10.pdparams`, to use the `transgan_cifar10` model in python: +```python +from config import get_config +from models.ViT_custom import Generator +# config files in ./configs/ +config = get_config('./configs/transgan_cifar10.yaml') +# build model +model = Generator(config) +# load pretrained weights, .pdparams is NOT needed +model_state_dict = paddle.load('./transgan_cifar10') +model.set_dict(model_state_dict) +``` + +## Generate Sample Images +To generate sample images from pretrained models, download the pretrained weights, and run the following script using command line: +```shell +sh run_generate.sh +``` +or +```shell +python generate.py \ + -cfg='./configs/transgan_cifar10.yaml' \ + -num_out_images=16 \ + -out_folder='./images_cifar10' \ + -pretrained='./transgan_cifar10.pdparams' +``` +The output images are stored in `-out_folder` path. + + +## Evaluation +To evaluate TransGAN model performance on Cifar10 with a single GPU, run the following script using command line: +```shell +sh run_eval_cifar.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg="./configs/transgan_cifar10.yaml" \ + -dataset='cifar10' \ + -batch_size=32 \ + -eval \ + -pretrained='./transgan_cifar10' +``` +
+ + +Run evaluation using multi-GPUs: + + + +```shell +sh run_eval_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/transgan_cifar10.yaml' \ + -dataset='cifar10' \ + -batch_size=32 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./transgan_cifar10' +``` + +
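+A Fréchet Inception Distance (FID) helper built on InceptionV3 is bundled with this implementation. As a rough sketch only (the `metrics.fid` module path and the pretrained InceptionV3 weights filename below are assumptions), it can be called directly on two image folders or precomputed `.npz` statistics:
+```python
+from metrics.fid import calculate_fid_given_paths
+
+# each path may be an image folder (*.jpg / *.png) or a precomputed .npz with mu/sigma
+fid_value = calculate_fid_given_paths(
+    paths=['./real_images', './images_cifar10'],
+    premodel_path='./inception_v3_pretrained.pdparams',  # assumed filename
+    batch_size=50,
+    use_gpu=True,
+    dims=2048)
+print(f'FID: {fid_value:.3f}')
+```
+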
+
+
+
+## Training
+To train the TransGAN model on Cifar10 with a single GPU, run the following script using command line:
+```shell
+sh run_train.sh
+```
+or
+```shell
+CUDA_VISIBLE_DEVICES=0 \
+python main_single_gpu.py \
+    -cfg="./configs/transgan_cifar10.yaml" \
+    -dataset='cifar10' \
+    -batch_size=32
+```
+
+
+
+Run training using multi-GPUs:
+
+
+
+```shell
+sh run_train_multi.sh
+```
+or
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+python main_multi_gpu.py \
+    -cfg='./configs/transgan_cifar10.yaml' \
+    -dataset='cifar10' \
+    -batch_size=16 \
+    -data_path='/dataset/imagenet'
+```
+
+
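+After training, sample images can also be produced directly in Python. The snippet below is only a rough sketch that reuses the `model` and `config` built in the Usage section above; the generator's output value range and the `epoch` argument passed to its forward are assumptions:
+```python
+import paddle
+
+model.eval()
+z = paddle.randn([16, config.MODEL.LATENT_DIM])   # a batch of latent vectors
+with paddle.no_grad():
+    fake = model(z, 0)                            # Generator.forward(z, epoch)
+# map from roughly [-1, 1] to [0, 255] HWC uint8 for saving
+imgs = ((fake + 1.) / 2. * 255.).clip(0, 255)
+imgs = imgs.transpose([0, 2, 3, 1]).numpy().astype('uint8')
+```
+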
+
+
+
+## Visualization of Generated Images
+### Generated Images after Training
+*(figure)*
Generated Images from Cifar10 datasets
+ +### Generated Images during Training +**(coming soon)** + +## Reference +``` +@article{jiang2021transgan, + title={Transgan: Two transformers can make one strong gan}, + author={Jiang, Yifan and Chang, Shiyu and Wang, Zhangyang}, + journal={arXiv preprint arXiv:2102.07074}, + year={2021} +} +``` diff --git a/gan/transGAN/run_eval_cifar.sh b/gan/transGAN/run_eval_cifar.sh new file mode 100644 index 00000000..2fa5002b --- /dev/null +++ b/gan/transGAN/run_eval_cifar.sh @@ -0,0 +1,7 @@ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ +-cfg="./configs/transgan_cifar10.yaml" \ +-dataset='cifar10' \ +-batch_size=64 \ +-eval \ +-pretrained='./transgan_cifar10' diff --git a/gan/transGAN/run_eval_multi_cifar.sh b/gan/transGAN/run_eval_multi_cifar.sh new file mode 100644 index 00000000..8b91bf2c --- /dev/null +++ b/gan/transGAN/run_eval_multi_cifar.sh @@ -0,0 +1,7 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg="./configs/transgan_cifar10.yaml" \ +-dataset='cifar10' \ +-batch_size=64 \ +-eval \ +-pretrained='./transgan_cifar10' diff --git a/gan/transGAN/run_generate.sh b/gan/transGAN/run_generate.sh new file mode 100644 index 00000000..aaa1e7dc --- /dev/null +++ b/gan/transGAN/run_generate.sh @@ -0,0 +1,5 @@ +python generate.py \ + -cfg='transgan_cifar10.yaml' \ + -num_out_images=16 \ + -out_folder='./images_cifar10' \ + -pretrained='transgan_cifar10.pdparams' \ No newline at end of file diff --git a/gan/transGAN/run_train.sh b/gan/transGAN/run_train.sh new file mode 100644 index 00000000..7e668186 --- /dev/null +++ b/gan/transGAN/run_train.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ +-cfg="transgan_cifar10.yaml" \ +-dataset='cifar10' \ +-batch_size=32 \ +-pretrained='./transgan_cifar10' \ No newline at end of file diff --git a/gan/transGAN/stl10_dataset.py b/gan/transGAN/stl10_dataset.py new file mode 100644 index 00000000..04828277 --- /dev/null +++ b/gan/transGAN/stl10_dataset.py @@ -0,0 +1,123 @@ + # Copyright (c) 2021 PPViT Authors. All Rights Reserved. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +""" +STL-10 Dataset and related methods +""" +import os +import numpy as np +from PIL import Image +from paddle.io import Dataset + + +class STL10Dataset(Dataset): + """paddle dataset for loading STL-10 binary data + This class will load the binary file from STL-10 dataset, + extract and read images and labels. Images are stored in numpy array, + with shape: [num_images, 96,96,3]. Labels are store in numpy array, with + shape: [num_images]. 
+ + Args: + file_folder: str, folder path of STL-10 dataset binary files + mode: str, dataset mode, choose from ['train', 'test'], default: 'train' + transform: paddle.vision.transforms, transforms which is applied on data, default: None + """ + def __init__(self, file_folder, mode='train', transform=None): + super().__init__() + assert mode in ['train', 'test', 'unlabeled'] + self.folder = file_folder + self.transform = transform + self.height = 96 + self.width = 96 + self.channels = 3 + self.mode = mode + # num of bytes of a single image + self.image_bytes = self.height * self.width * self.channels + self.train_filepath = os.path.join(file_folder, f'{mode}_X.bin') + self.images = read_all_images(self.train_filepath) + + if mode != 'unlabeled': + self.label_filepath = os.path.join(file_folder, f'{mode}_y.bin') + self.labels = read_labels(self.label_filepath) + else: + self.labels = np.zeros(self.__len__()) + + print(f'----- STL-10 dataset {mode} len = {self.labels.shape[0]}') + + def __len__(self): + return self.images.shape[0] + + def __getitem__(self, index): + data = self.images[index] + if self.transform is not None: + data = self.transform(data) + label = self.labels[index] + return data, label + + +def read_labels(label_path): + """read data labels from binary file + Args: + label_path: label binary file path, e.g.,'train_y.bin' + Returns: + labels: np.array, the label array with shape [num_images] + """ + with open(label_path, 'rb') as infile: + labels = np.fromfile(infile, dtype=np.uint8) + return labels + + +def read_all_images(data_path): + """read all images from binary file + Args: + data_path: data binary file path, e.g.,'train_X.bin' + Returns: + images: np.array, the image array with shape [num_images, 96, 96, 3] + """ + with open(data_path, 'rb') as infile: + # read whole data in unit8 + data = np.fromfile(infile, dtype=np.uint8) + # images are stored in column major order + # 1st, 2nd, 3rd 96x96 are red, green, blue channels + images = np.reshape(data, (-1, 3, 96, 96)) + # outputs are with shape [num_images, height, width, channels] + images = np.transpose(images, (0, 3, 2, 1)) + return images + + +def save_image(image, name): + img = Image.fromarray(image) + img.save(f"{name}.png") + + +def save_images(images, labels, out_path): + for idx, image in enumerate(images): + out_path = os.path.join(out_path, str(labels[idx])) + os.makedirs(out_path, exist_ok=True) + save_image(image, os.path.join(out_path, str(idx)+'.png')) + + +## NOTE: this is for test, can be removed later +#if __name__ == "__main__": +# dataset = STL10Dataset(file_folder='./stl10_binary') +# print(dataset.labels.shape) +# for idx, (data, label) in enumerate(dataset): +# print(idx) +# print(data.shape) +# # save images to file +# save_image(data, f'{idx}.png') +# print(label) +# print('-----') +# if idx == 10: +# break diff --git a/gan/transGAN/utils.py b/gan/transGAN/utils.py new file mode 100644 index 00000000..c220ac8a --- /dev/null +++ b/gan/transGAN/utils.py @@ -0,0 +1,368 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""utils for transGAN +Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training +and WarmupCosineScheduler for training +""" + +import math +import pickle +from scipy import special +import numpy as np +import paddle +import paddle.nn as nn +import paddle.distributed as dist +from paddle.optimizer.lr import LRScheduler +import paddle.nn.functional as F + + +# Several initialization methods +@paddle.no_grad() +def constant_(x, value): + temp_value = paddle.full(x.shape, value, x.dtype) + x.set_value(temp_value) + return x + + +@paddle.no_grad() +def normal_(x, mean=0., std=1.): + temp_value = paddle.normal(mean, std, shape=x.shape) + x.set_value(temp_value) + return x + + +@paddle.no_grad() +def uniform_(x, a=-1., b=1.): + temp_value = paddle.uniform(min=a, max=b, shape=x.shape) + x.set_value(temp_value) + return x + + +def gelu(x): + """ Original Implementation of the gelu activation function in Google Bert repo + when initialy created. + For information: OpenAI GPT's gelu is slightly different (and gives slightly + different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + Also see https://arxiv.org/abs/1606.08415 + """ + return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0))) + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2) + + with paddle.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor = paddle.uniform(tensor.shape, min=(2 * l - 1), max=(2 * u - 1)) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor = paddle.to_tensor(special.erfinv(tensor.numpy())) + + # Transform to proper mean, std + tensor = paddle.multiply(tensor, paddle.to_tensor(std * math.sqrt(2.))) + tensor = paddle.add(tensor, paddle.to_tensor(mean)) + + # Clamp to ensure it's in the proper range + tensor = paddle.clip(tensor, min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): + # type: (Tensor, float, float, float, float) -> Tensor + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. 
+ Args: + tensor: an n-dimensional `paddle.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = paddle.empty(3, 5) + >>> trunc_normal_(w) + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +class AverageMeter(): + """ Meter for monitoring losses""" + def __init__(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.reset() + + def reset(self): + """reset all values to zeros""" + self.avg = 0 + self.sum = 0 + self.cnt = 0 + + def update(self, val, n=1): + """update avg by val and n, where val is the avg of n values""" + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + +def get_exclude_from_weight_decay_fn(exclude_list=[]): + """ Set params with no weight decay during the training + For certain params, e.g., positional encoding in ViT, weight decay + may not needed during the learning, this method is used to find + these params. + Args: + exclude_list: a list of params names which need to exclude + from weight decay. + Returns: + exclude_from_weight_decay_fn: a function returns True if param + will be excluded from weight decay + """ + if len(exclude_list) == 0: + exclude_from_weight_decay_fn = None + else: + def exclude_fn(param): + for name in exclude_list: + if param.endswith(name): + return False + return True + exclude_from_weight_decay_fn = exclude_fn + return exclude_from_weight_decay_fn + + +class WarmupCosineScheduler(LRScheduler): + """Warmup Cosine Scheduler + First apply linear warmup, then apply cosine decay schedule. + Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" + Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining + "total_epochs - warmup_epochs" + Attributes: + learning_rate: the starting learning rate (without warmup), not used here! + warmup_start_lr: warmup starting learning rate + start_lr: the starting learning rate (without warmup) + end_lr: the ending learning rate after whole loop + warmup_epochs: # of epochs for warmup + total_epochs: # of total epochs (include warmup) + """ + def __init__(self, + learning_rate, + warmup_start_lr, + start_lr, + end_lr, + warmup_epochs, + total_epochs, + cycles=0.5, + last_epoch=-1, + verbose=False): + """init WarmupCosineScheduler """ + self.warmup_epochs = warmup_epochs + self.total_epochs = total_epochs + self.warmup_start_lr = warmup_start_lr + self.start_lr = start_lr + self.end_lr = end_lr + self.cycles = cycles + super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + """ return lr value """ + if self.last_epoch < self.warmup_epochs: + val = (self.start_lr - self.warmup_start_lr) * float( + self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr + return val + + progress = float(self.last_epoch - self.warmup_epochs) / float( + max(1, self.total_epochs - self.warmup_epochs)) + val = max(0.0, 0.5 * (1. 
+ math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) + return val + + +def leakyrelu(x): + return nn.functional.leaky_relu(x, 0.2) + + +def DiffAugment(x, policy='', channels_first=True, affine=None): + if policy: + if not channels_first: + x = x.transpose(0, 3, 1, 2) + for p in policy.split(','): + for f in AUGMENT_FNS[p]: + x = f(x, affine=affine) + if not channels_first: + x = x.transpose(0, 2, 3, 1) + return x + + +# belong to DiffAugment +def rand_brightness(x, affine=None): + x = x + (paddle.rand(x.size(0), 1, 1, 1, dtype=x.dtype, device=x.device) - 0.5) + return x + + +# belong to DiffAugment +def rand_saturation(x, affine=None): + x_mean = x.mean(dim=1, keepdim=True) + x = (x - x_mean) * (paddle.rand( + x.size(0), 1, 1, 1, dtype=x.dtype, device=x.device) * 2) + x_mean + return x + +# belong to DiffAugment +def rand_contrast(x, affine=None): + x_mean = x.mean(dim=[1, 2, 3], keepdim=True) + x = (x - x_mean) * (paddle.rand( + x.size(0), 1, 1, 1, dtype=x.dtype, device=x.device) + 0.5) + x_mean + return x + + +# belong to DiffAugment +def rand_cutout(x, ratio=0.5, affine=None): + if random.random() < 0.3: + cutout_size = int(x.size(2) * ratio + 0.5), int(x.size(3) * ratio + 0.5) + offset_x =paddle.randint(0, + x.size(2) + (1 - cutout_size[0] % 2), size=[x.size(0), 1, 1], + device=x.device) + offset_y = paddle.randint(0, + x.size(3) + (1 - cutout_size[1] % 2), size=[x.size(0), 1, 1], + device=x.device) + grid_batch, grid_x, grid_y = paddle.meshgrid( + paddle.arange(x.size(0), dtype=paddle.long, device=x.device), + paddle.arange(cutout_size[0], dtype=paddle.long, device=x.device), + paddle.arange(cutout_size[1], dtype=paddle.long, device=x.device), + ) + grid_x = paddle.clamp(grid_x + offset_x - cutout_size[0] // 2, min=0, max=x.size(2) - 1) + grid_y = paddle.clamp(grid_y + offset_y - cutout_size[1] // 2, min=0, max=x.size(3) - 1) + del offset_x + del offset_y + mask = paddle.ones(x.size(0), x.size(2), x.size(3), dtype=x.dtype, device=x.device) + mask[grid_batch, grid_x, grid_y] = 0 + x = x * mask.unsqueeze(1) + del mask + del grid_x + del grid_y + del grid_batch + return x + + +# belong to DiffAugment +def rand_translation(x, ratio=0.2, affine=None): + shift_x, shift_y = int(x.shape[2] * ratio + 0.5), int(x.shape[3] * ratio + 0.5) + translation_x = paddle.randint(-shift_x, shift_x + 1, shape=[x.shape[0], 1, 1]) + translation_y = paddle.randint(-shift_y, shift_y + 1, shape=[x.shape[0], 1, 1]) + grid_batch, grid_x, grid_y = paddle.meshgrid( + paddle.arange(x.shape[0]), + paddle.arange(x.shape[2]), + paddle.arange(x.shape[3]), + ) + grid_x = paddle.clip(grid_x + translation_x + 1, 0, x.shape[2] + 1) + grid_y = paddle.clip(grid_y + translation_y + 1, 0, x.shape[3] + 1) + x_pad = F.pad(x, [1, 1, 1, 1, 0, 0, 0, 0]) + x = x_pad.transpose([0, 2, 3, 1])[grid_batch, grid_x, grid_y].transpose([0, 3, 1, 2]) + return x + + +AUGMENT_FNS = { + 'color': [rand_brightness, rand_saturation, rand_contrast], + 'translation': [rand_translation], + 'cutout': [rand_cutout], +} + + +def drop_path(x, drop_prob: float = 0., training: bool = False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + This is the same as the DropConnect impl author created for EfficientNet, etc networks, + however,the original name is misleading as 'Drop Connect' is a different form of dropout in + a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... 
+ author have opted for changing the layer and argument names to 'drop path' rather than mix + DropConnect as a layer name and use 'survival rate' as the argument. + """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(keep_prob) * random_tensor + return output + + +def pixel_upsample(x, H, W): + B, N, C = x.shape + assert N == H*W + x = x.transpose((0, 2, 1)) + x = x.reshape((-1, C, H, W)) + x = nn.PixelShuffle(2)(x) + B, C, H, W = x.shape + x = x.reshape((-1, C, H*W)) + x = x.transpose((0, 2, 1)) + return x, H, W + + +def all_gather(data): + """ run all_gather on any picklable data (do not requires tensors) + Args: + data: picklable object + Returns: + data_list: list of data gathered from each rank + """ + world_size = dist.get_world_size() + if world_size == 1: + return [data] + + buffer = pickle.dumps(data) #write data into Bytes and stores in buffer + np_buffer = np.frombuffer(buffer, dtype=np.int8) + tensor = paddle.to_tensor(np_buffer, dtype='int32') # uint8 doese not have many ops in paddle + + # obtain Tensor size of each rank + local_size = paddle.to_tensor([tensor.shape[0]]) + size_list = [] + dist.all_gather(size_list, local_size) + max_size = max(size_list) + + # receiving tensors from all ranks, + # all_gather does not support different shape, so we use padding + tensor_list = [] + if local_size != max_size: + padding = paddle.empty(shape=(max_size - local_size, ), dtype='int32') + tensor = paddle.concat((tensor, padding), axis=0) + dist.all_gather(tensor_list, tensor) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.astype('uint8').cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list diff --git a/image_classification/CSwin/README.md b/image_classification/CSwin/README.md new file mode 100644 index 00000000..a1363e7f --- /dev/null +++ b/image_classification/CSwin/README.md @@ -0,0 +1,168 @@ +# CSWin Transformer: A General Vision Transformer Backbone with Cross-Shaped Windows, [arxiv](https://arxiv.org/pdf/2107.00652.pdf) + +PaddlePaddle training/validation code and pretrained models for **CSWin Transformer**. + +The official pytorch implementation is [here](https://github.com/microsoft/CSWin-Transformer). + +This implementation is developed by [PaddleViT](https://github.com/BR-IDL/PaddleViT.git). + + +

+<img src="./cswin1.png" alt="drawing" width="90%"/>
+<img src="./cswin2.png" alt="drawing" width="90%"/>
+<h4 align="center">CSWin Transformer Model Overview</h4>

+ + +### Update +Update (2021-08-11): Code is released and ported weights are uploaded. + +## Models Zoo +| Model | Acc@1 | Acc@5 | Image Size | Crop_pct | Interpolation | Link | +|--------------------------------|-------|-------|------------|----------|---------------|--------------| +| cswin_tiny_224 | 82.81 | 96.30 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1l-JY0u7NGyD6SjkyiyNnDx3wFFT1nAYO/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1L5FqU7ImWAhQHAlSilqVAw)(4q3h) | +| cswin_small_224 | 83.60 | 96.58 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/10eEBk3wvJdQ8Dy58LvQ11Wk1K2UfPy-E/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1FiaNiWyAuWu1IBsUFLUaAw)(gt1a) | +| cswin_base_224 | 84.23 | 96.91 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1YufKh3DKol4-HrF-I22uiorXSZDIXJmZ/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1koy8hXyGwvgAfUxdlkWofg)(wj8p) | +| cswin_large_224 | 86.52 | 97.99 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1V1hteGK27t1nI84Ac7jdWfydBLLo7Fxt/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1KgIX6btML6kPiPGkIzvyVA)(b5fs) | +| cswin_base_384 | 85.51 | 97.48 | 384 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1qCaFItzFoTYBo-4UbGzL6M5qVDGmJt4y/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1WNkY7o_vP9KJ8cd5c7n2sQ)(rkf5) | +| cswin_large_384 | 87.49 | 98.35 | 384 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1LRN_6qUz71yP-OAOpN4Lscb8fkUytMic/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1eCIpegPj1HIbJccPMaAsew)(6235) | + +> *The results are evaluated on ImageNet2012 validation set. + +## Notebooks +We provide a few notebooks in aistudio to help you get started: + +**\*(coming soon)\*** + + +## Requirements +- Python>=3.6 +- yaml>=0.2.5 +- [PaddlePaddle](https://www.paddlepaddle.org.cn/documentation/docs/en/install/index_en.html)>=2.1.0 +- [yacs](https://github.com/rbgirshick/yacs)>=0.1.8 + +## Data +ImageNet2012 dataset is used in the following folder structure: +``` +│imagenet/ +├──train/ +│ ├── n01440764 +│ │ ├── n01440764_10026.JPEG +│ │ ├── n01440764_10027.JPEG +│ │ ├── ...... +│ ├── ...... +├──val/ +│ ├── n01440764 +│ │ ├── ILSVRC2012_val_00000293.JPEG +│ │ ├── ILSVRC2012_val_00002138.JPEG +│ │ ├── ...... +│ ├── ...... +``` + +## Usage +To use the model with pretrained weights, download the `.pdparam` weight file and change related file paths in the following python scripts. The model config files are located in `./configs/`. + +For example, assume the downloaded weight file is stored in `./cswin_base_224.pdparams`, to use the `cswin_base_224` model in python: +```python +from config import get_config +from cswin import build_cswin as build_model +# config files in ./configs/ +config = get_config('./configs/cswin_base_224.yaml') +# build model +model = build_model(config) +# load pretrained weights, .pdparams is NOT needed +model_state_dict = paddle.load('./cswin_base_224') +model.set_dict(model_state_dict) +``` + +## Evaluation +To evaluate CSWin model performance on ImageNet2012 with a single GPU, run the following script using command line: +```shell +sh run_eval.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/cswin_base_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./cswin_base_224' +``` + +
+ + +Run evaluation using multi-GPUs: + + + +```shell +sh run_eval_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/cswin_base_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./cswin_base_224' +``` + +
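+
+For a quick sanity check of the ported weights before running the full evaluation, a forward pass on a dummy input is enough. The snippet below is a minimal sketch, assuming the downloaded weight file is stored as `./cswin_base_224.pdparams` and that it is run from this folder so `config.py` and `cswin.py` are importable, as in the Usage section above:
+
+```python
+import paddle
+from config import get_config
+from cswin import build_cswin as build_model
+
+# build the model from the yaml config, as in the Usage section
+config = get_config('./configs/cswin_base_224.yaml')
+model = build_model(config)
+
+# load the ported weights (full file path, mirroring main_multi_gpu.py)
+model.set_dict(paddle.load('./cswin_base_224.pdparams'))
+model.eval()
+
+# forward a dummy batch and check that the output is [batch_size, 1000] logits
+dummy = paddle.randn([1, 3, config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE])
+with paddle.no_grad():
+    logits = model(dummy)
+print(logits.shape)  # expected: [1, 1000]
+```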
+ +## Training +To train the CSWin model on ImageNet2012 with single GPU, run the following script using command line: +```shell +sh run_train.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/cswin_base_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=32 \ + -data_path='/dataset/imagenet' \ +``` + +
+ + +Run training using multi-GPUs: + + + +```shell +sh run_train_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/cswin_base_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ +``` + +
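+
+The training scripts use a warmup-then-cosine learning-rate schedule controlled by `TRAIN.WARMUP_EPOCHS`, `TRAIN.BASE_LR`, `TRAIN.WARMUP_START_LR` and `TRAIN.END_LR` in `config.py`. The snippet below is a minimal sketch of how that schedule evolves, assuming the local `utils.py` provides the `WarmupCosineScheduler` imported by `main_single_gpu.py`/`main_multi_gpu.py`; the numbers are the defaults from `config.py`:
+
+```python
+from utils import WarmupCosineScheduler  # shipped alongside the training scripts
+
+sched = WarmupCosineScheduler(learning_rate=0.001,   # BASE_LR (placeholder, not used by get_lr)
+                              warmup_start_lr=1e-6,  # WARMUP_START_LR
+                              start_lr=0.001,        # BASE_LR
+                              end_lr=5e-4,           # END_LR
+                              warmup_epochs=3,       # WARMUP_EPOCHS
+                              total_epochs=300)      # NUM_EPOCHS
+
+# linear warmup for the first 3 epochs, then cosine decay towards END_LR
+for epoch in range(10):
+    print(f"epoch {epoch}: lr = {sched.get_lr():.6f}")
+    sched.step()
+```
+
+The same scheduler object is passed as the `learning_rate` argument of the AdamW optimizer in the training scripts, and `scheduler.step()` is called once per epoch.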
+ +## Visualization Attention Map +**(coming soon)** + +## Reference +``` +@article{dong2021cswin, + title={CSWin Transformer: A General Vision Transformer Backbone with Cross-Shaped Windows}, + author={Dong, Xiaoyi and Bao, Jianmin and Chen, Dongdong and Zhang, Weiming and Yu, Nenghai and Yuan, Lu and Chen, Dong and Guo, Baining}, + journal={arXiv preprint arXiv:2107.00652}, + year={2021} +} +``` diff --git a/image_classification/CSwin/config.py b/image_classification/CSwin/config.py new file mode 100644 index 00000000..2aeb03c8 --- /dev/null +++ b/image_classification/CSwin/config.py @@ -0,0 +1,164 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration + +Configuration for data, model archtecture, and training, etc. +Config can be set by .yaml file or by argparser(limited usage) + +""" + +import os +from yacs.config import CfgNode as CN +import yaml + +_C = CN() +_C.BASE = [''] + +# data settings +_C.DATA = CN() +_C.DATA.BATCH_SIZE = 8 #1024 batch_size for single GPU +_C.DATA.BATCH_SIZE_EVAL = 8 #1024 batch_size for single GPU +_C.DATA.DATA_PATH = '/dataset/imagenet/' # path to dataset +_C.DATA.DATASET = 'imagenet2012' # dataset name +_C.DATA.IMAGE_SIZE = 224 # input image size +_C.DATA.CROP_PCT = 0.9 # input image scale ratio, scale is applied before centercrop in eval mode +_C.DATA.NUM_WORKERS = 2 # number of data loading threads + +# model settings +_C.MODEL = CN() +_C.MODEL.TYPE = 'CSwin' +_C.MODEL.NAME = 'CSwin' +_C.MODEL.RESUME = None +_C.MODEL.PRETRAINED = None +_C.MODEL.NUM_CLASSES = 1000 +_C.MODEL.DROPOUT = 0.1 +_C.MODEL.DROPPATH = 0.0 +_C.MODEL.ATTENTION_DROPOUT = 0.0 + +# transformer settings +_C.MODEL.TRANS = CN() +_C.MODEL.TRANS.PATCH_SIZE = 4 # image_size = patch_size x window_size x num_windows +_C.MODEL.TRANS.SPLIT_SIZES = [7] +_C.MODEL.TRANS.IN_CHANNELS = 3 +_C.MODEL.TRANS.EMBED_DIM = 96 # same as HIDDEN_SIZE in ViT +_C.MODEL.TRANS.DEPTHS = [2, 2, 6, 2] +_C.MODEL.TRANS.NUM_HEADS = [3, 6, 12, 24] +_C.MODEL.TRANS.MLP_RATIO = 4. 
+_C.MODEL.TRANS.QKV_BIAS = True +_C.MODEL.TRANS.QK_SCALE = None + +# training settings +_C.TRAIN = CN() +_C.TRAIN.LAST_EPOCH = 0 +_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 3 #34 # ~ 10k steps for 4096 batch size +_C.TRAIN.WEIGHT_DECAY = 0.05 #0.3 # 0.0 for finetune +_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune +_C.TRAIN.WARMUP_START_LR = 1e-6 #0.0 +_C.TRAIN.END_LR = 5e-4 +_C.TRAIN.GRAD_CLIP = 1.0 +_C.TRAIN.ACCUM_ITER = 2 #1 + +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' +_C.TRAIN.LR_SCHEDULER.MILESTONES = "30, 60, 90" # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 # only used in StepLRScheduler + +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'AdamW' +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for adamW +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# augmentation +_C.AUG = CN() +_C.AUG.COLOR_JITTER = 0.4 # color jitter factor +_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' +_C.AUG.RE_PROB = 0.25 # random earse prob +_C.AUG.RE_MODE = 'pixel' # random earse mode +_C.AUG.RE_COUNT = 1 # random earse count +_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 +_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 +_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha +_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled +_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled +_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' + +# misc +_C.SAVE = "./output" +_C.TAG = "default" +_C.SAVE_FREQ = 20 # freq to save chpt +_C.REPORT_FREQ = 50 # freq to logging info +_C.VALIDATE_FREQ = 20 # freq to do validation +_C.SEED = 0 +_C.EVAL = False # run evaluation only +_C.LOCAL_RANK = 0 +_C.NGPUS = -1 + + +def _update_config_from_file(config, cfg_file): + config.defrost() + with open(cfg_file, 'r') as infile: + yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader) + for cfg in yaml_cfg.setdefault('BASE', ['']): + if cfg: + _update_config_from_file( + config, os.path.join(os.path.dirname(cfg_file), cfg) + ) + print('merging config from {}'.format(cfg_file)) + config.merge_from_file(cfg_file) + config.freeze() + +def update_config(config, args): + """Update config by ArgumentParser + Args: + args: ArgumentParser contains options + Return: + config: updated config + """ + if args.cfg: + _update_config_from_file(config, args.cfg) + config.defrost() + if args.dataset: + config.DATA.DATASET = args.dataset + if args.batch_size: + config.DATA.BATCH_SIZE = args.batch_size + if args.image_size: + config.DATA.IMAGE_SIZE = args.image_size + if args.data_path: + config.DATA.DATA_PATH = args.data_path + if args.ngpus: + config.NGPUS = args.ngpus + if args.eval: + config.EVAL = True + config.DATA.BATCH_SIZE_EVAL = args.batch_size + if args.pretrained: + config.MODEL.PRETRAINED = args.pretrained + if args.resume: + config.MODEL.RESUME = args.resume + if args.last_epoch: + config.TRAIN.LAST_EPOCH = args.last_epoch + + #config.freeze() + return config + + +def get_config(cfg_file=None): + """Return a clone of config or load from yaml file""" + config = _C.clone() + if cfg_file: + _update_config_from_file(config, cfg_file) + return config diff --git a/image_classification/CSwin/configs/cswin_base_224.yaml b/image_classification/CSwin/configs/cswin_base_224.yaml new file mode 100644 index 00000000..0a3d906a --- /dev/null +++ 
b/image_classification/CSwin/configs/cswin_base_224.yaml @@ -0,0 +1,12 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.9 +MODEL: + TYPE: cswin + NAME: cswin_base_224 + TRANS: + PATCH_SIZE: 4 + EMBED_DIM: 96 + DEPTHS: [2, 4, 32, 2] + SPLIT_SIZES: [1, 2, 7, 7] + NUM_HEADS: [4, 8, 16, 32] diff --git a/image_classification/CSwin/configs/cswin_base_384.yaml b/image_classification/CSwin/configs/cswin_base_384.yaml new file mode 100644 index 00000000..bafa829b --- /dev/null +++ b/image_classification/CSwin/configs/cswin_base_384.yaml @@ -0,0 +1,12 @@ +DATA: + IMAGE_SIZE: 384 + CROP_PCT: 1.0 +MODEL: + TYPE: cswin + NAME: cswin_base_384 + TRANS: + PATCH_SIZE: 4 + EMBED_DIM: 96 + DEPTHS: [2, 4, 32, 2] + SPLIT_SIZES: [1, 2, 12, 12] + NUM_HEADS: [4, 8, 16, 32] diff --git a/image_classification/CSwin/configs/cswin_large_224.yaml b/image_classification/CSwin/configs/cswin_large_224.yaml new file mode 100644 index 00000000..6f590528 --- /dev/null +++ b/image_classification/CSwin/configs/cswin_large_224.yaml @@ -0,0 +1,12 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.9 +MODEL: + TYPE: cswin + NAME: cswin_large_224 + TRANS: + PATCH_SIZE: 4 + EMBED_DIM: 144 + DEPTHS: [2, 4, 32, 2] + SPLIT_SIZES: [1, 2, 7, 7] + NUM_HEADS: [6, 12, 24, 24] diff --git a/image_classification/CSwin/configs/cswin_large_384.yaml b/image_classification/CSwin/configs/cswin_large_384.yaml new file mode 100644 index 00000000..465ffe8b --- /dev/null +++ b/image_classification/CSwin/configs/cswin_large_384.yaml @@ -0,0 +1,12 @@ +DATA: + IMAGE_SIZE: 384 + CROP_PCT: 1.0 +MODEL: + TYPE: cswin + NAME: cswin_large_384 + TRANS: + PATCH_SIZE: 4 + EMBED_DIM: 144 + DEPTHS: [2, 4, 32, 2] + SPLIT_SIZES: [1, 2, 12, 12] + NUM_HEADS: [6, 12, 24, 24] diff --git a/image_classification/CSwin/configs/cswin_small_224.yaml b/image_classification/CSwin/configs/cswin_small_224.yaml new file mode 100644 index 00000000..f5cf5ab1 --- /dev/null +++ b/image_classification/CSwin/configs/cswin_small_224.yaml @@ -0,0 +1,12 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.9 +MODEL: + TYPE: cswin + NAME: cswin_small_224 + TRANS: + PATCH_SIZE: 4 + EMBED_DIM: 64 + DEPTHS: [2, 4, 32, 2] + SPLIT_SIZES: [1, 2, 7, 7] + NUM_HEADS: [2, 4, 8, 16] diff --git a/image_classification/CSwin/configs/cswin_tiny_224.yaml b/image_classification/CSwin/configs/cswin_tiny_224.yaml new file mode 100644 index 00000000..77f643b9 --- /dev/null +++ b/image_classification/CSwin/configs/cswin_tiny_224.yaml @@ -0,0 +1,12 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.9 +MODEL: + TYPE: cswin + NAME: cswin_tiny_224 + TRANS: + PATCH_SIZE: 4 + EMBED_DIM: 64 + DEPTHS: [1, 2, 21, 1] + SPLIT_SIZES: [1, 2, 7, 7] + NUM_HEADS: [2, 4, 8, 16] diff --git a/image_classification/CSwin/cswin.py b/image_classification/CSwin/cswin.py new file mode 100644 index 00000000..86c40d30 --- /dev/null +++ b/image_classification/CSwin/cswin.py @@ -0,0 +1,519 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Implement Transformer Class for CSwin +""" + +import copy +import numpy as np +import paddle +import paddle.nn as nn +from droppath import DropPath + + +class Identity(nn.Layer): + """ Identity layer + + The output of this layer is the input without any change. + Use this layer to avoid if condition in some forward methods + + """ + def __init__(self): + super().__init__() + + def forward(self, x): + return x + + +class PatchEmbedding(nn.Layer): + """CSwin Patch Embedding + This patch embedding has a 7x7 conv + layernorm, the output tensor + is reshaped to [Batch, H*W, embed_dim]. Note that the patch is applied + by a conv with overlap (using patch_stride). + + Args: + patch_stride: int, patch stride size, default: 4 + in_channels: int, number of channels of input image, default: 3 + embed_dim: int, output feature dimension, default: 96 + """ + def __init__(self, patch_stride=4, in_channels=3, embed_dim=96): + super().__init__() + self.patch_embed = nn.Conv2D(in_channels=in_channels, + out_channels=embed_dim, + kernel_size=7, + stride=patch_stride, + padding=2) + self.norm = nn.LayerNorm(embed_dim) + + def forward(self, x): + x = self.patch_embed(x) # [batch, embed_dim, h, w], h = w = image_size / 4 + x = x.flatten(start_axis=2, stop_axis=-1) # [batch, embed_dim, h*w] + x = x.transpose([0, 2, 1]) # [batch, h*w, embed_dim] + x = self.norm(x) + return x + + +class Mlp(nn.Layer): + """ MLP module + + Impl using nn.Linear and activation is GELU, dropout is applied. + Ops: fc -> act -> dropout -> fc -> dropout + + Attributes: + fc1: nn.Linear + fc2: nn.Linear + act: GELU + dropout1: dropout after fc1 + dropout2: dropout after fc2 + """ + def __init__(self, in_features, hidden_features, dropout): + super().__init__() + w_attr_1, b_attr_1 = self._init_weights() + self.fc1 = nn.Linear(in_features, + hidden_features, + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + w_attr_2, b_attr_2 = self._init_weights() + self.fc2 = nn.Linear(hidden_features, + in_features, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + self.act = nn.GELU() + self.dropout = nn.Dropout(dropout) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierUniform()) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Normal(std=1e-6)) + return weight_attr, bias_attr + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.dropout(x) + x = self.fc2(x) + x = self.dropout(x) + return x + + +def img2windows(img, h_split, w_split): + """Convert input tensor into split stripes + + Args: + img: tensor, image tensor with shape [B, C, H, W] + h_split: int, splits width in height direction + w_split: int, splits width in width direction + Returns: + out: tensor, splitted image + """ + B, C, H, W = img.shape + out = img.reshape([B, C, H // h_split, h_split, W // w_split, w_split]) + out = out.transpose([0, 2, 4, 3, 5, 1]) # [B, H//h_split, W//w_split, h_split, w_split, C] + out = out.reshape([-1, h_split * w_split, C]) # [B, H//h_split, W//w_split, h_split*w_split, C] + return out + + +def windows2img(img_splits, h_split, w_split, img_h, img_w): + """Convert splitted stripes back + + Args: + img_splits: tensor, image tensor with shape [B, C, H, W] + h_split: int, splits width in height direction + w_split: int, splits width in width direction + img_h: int, original tensor height + img_w: int, original tensor width + Returns: + img: tensor, original tensor + """ + B = int(img_splits.shape[0] / (img_h / h_split * img_w / w_split)) + img = img_splits.reshape([B, 
img_h // h_split, img_w // w_split, h_split, w_split, -1]) + img = img.transpose([0, 1, 3, 2, 4, 5]) #[B,img_h//h_split, h_split, img_w//w_split, w_split,C] + img = img.reshape([B, img_h, img_w, -1]) # [B, img_h, img_w, C] + return img + + +class LePEAttention(nn.Layer): + """Cross Shaped Window self-attention with Locally enhanced positional encoding""" + def __init__(self, + dim, + resolution, + h_split=7, + w_split=7, + num_heads=8, + attention_dropout=0., + dropout=0., + qk_scale=None): + super().__init__() + self.dim = dim + self.resolution = resolution + self.num_heads = num_heads + self.dim_head = dim // num_heads + self.scale = qk_scale or self.dim_head ** -0.5 + self.h_split = h_split + self.w_split = w_split + + self.get_v = nn.Conv2D(in_channels=dim, + out_channels=dim, + kernel_size=3, + stride=1, + padding=1, + groups=dim) + + self.softmax = nn.Softmax(axis=-1) + self.attn_dropout = nn.Dropout(attention_dropout) + + def im2cswin(self, x): + B, HW, C = x.shape + H = W = int(np.sqrt(HW)) + x = x.transpose([0, 2, 1]) # [B, C, H*W] + x = x.reshape([B, C, H, W]) # [B, C, H, W] + x = img2windows(x, self.h_split, self.w_split) + x = x.reshape([-1, self.h_split * self.w_split, self.num_heads, self.dim_head]) + x = x.transpose([0, 2, 1, 3]) + return x + + def get_lepe(self, x, func): + """Locally Enhanced Positional Encoding (LePE) + This module applies a depthwise conv on V and returns the lepe + Args: + x: tensor, the input tensor V + func: nn.Layer, a depth wise conv of kernel 3 stride 1 and padding 1 + """ + B, HW, C = x.shape + H = W = int(np.sqrt(HW)) + h_split = self.h_split + w_split = self.w_split + + x = x.transpose([0, 2, 1]) # [B, C, H*W] + x = x.reshape([B, C, H, W]) # [B, C, H, W] + x = x.reshape([B, C, H // h_split, h_split, W // w_split, w_split]) + x = x.transpose([0, 2, 4, 1, 3, 5]) # [B, H//h_split, W//w_split, C, h_split, w_split] + x = x.reshape([-1, C, h_split, w_split]) # [B*(H//h_split)*(W//w_split), h_split, w_split] + + lepe = func(x) # depth wise conv does not change shape + #lepe = lepe.reshape([-1, self.num_heads, C // self.num_heads, h_split * w_split]) + lepe = lepe.reshape([-1, self.num_heads, self.dim_head, h_split * w_split]) + lepe = lepe.transpose([0, 1, 3, 2]) # [B, num_heads, h_spllit*w_split, dim_head] + + x = x.reshape([-1, self.num_heads, self.dim_head, h_split * w_split]) + x = x.transpose([0, 1, 3, 2]) # [B, num_heads, h_split*wsplit, dim_head] + return x, lepe + + def forward(self, q, k, v): + B, HW, C = q.shape + H = W = self.resolution + q = self.im2cswin(q) + k = self.im2cswin(k) + v, lepe = self.get_lepe(v, self.get_v) + + q = q * self.scale + attn = paddle.matmul(q, k, transpose_y=True) + attn = self.softmax(attn) + attn = self.attn_dropout(attn) + + z = paddle.matmul(attn, v) + z = z + lepe + z = z.transpose([0, 2, 1, 3]) + z = z.reshape([-1, self.h_split * self.w_split, C]) + + z = windows2img(z, self.h_split, self.w_split, H, W) + z = z.reshape([B, -1, C]) + return z + + +class CSwinBlock(nn.Layer): + """CSwin Block + + CSwin block contains a LePE attention modual, a linear projection, + a mlp layer, and related norms layers. In the first 3 stages, the + LePE attention moduals used 2 branches, where horizontal and + vertical split stripes are used for self attention and a concat + op is applied to combine the outputs. The last stage does not + have branche in LePE attention. + + Args: + dim: int, input feature dimension + input_resolution: int, input feature spatial size. 
+ num_heads: int, num of attention heads in current stage + split_size: int, the split size in current stage + mlp_ratio: float, mlp ratio, mlp_hidden_dim = mlp_ratio * mlp_in_dim, default: 4. + qkv_bias: bool, if set True, qkv projection will have bias, default: True + qk_scale: float, if set, replace the orig qk_scale (dim_head ** -0.5), default: None + dropout: float, dropout rate for linear projection, default: 0 + attention_dropout: float, dropout rate for attention, default: 0 + droppath: float, drop path rate, default: 0 + split_heads: bool, if True, split heads is applied (True for 1,2,3 stages), default: True + """ + def __init__(self, + dim, + input_resolution, + num_heads, + split_size=7, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + attention_dropout=0., + dropout=0., + droppath=0., + split_heads=True): + super().__init__() + self.dim = dim + # NOTE: here assume image_h == imgae_w + self.input_resolution = (input_resolution, input_resolution) + self.num_heads = num_heads + self.dim_head = dim // num_heads + self.mlp_ratio = mlp_ratio + self.split_size = split_size + self.norm1 = nn.LayerNorm(dim) + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attns = nn.LayerList() + self.split_heads = split_heads + + num_branches = 2 if split_heads else 1 + if split_heads: # first 3 stages + splits = [self.input_resolution[0], self.split_size] # horizantal splits + else: # last stage + splits = [self.input_resolution[0], self.input_resolution[0]] + for _ in range(num_branches): + attn = LePEAttention(dim=dim//num_branches, + resolution=input_resolution, + h_split=splits[0], + w_split=splits[1], + num_heads=num_heads//num_branches, + qk_scale=qk_scale, + attention_dropout=attention_dropout, + dropout=dropout) + self.attns.append(copy.deepcopy(attn)) + # switch splits from horizantal to vertical + # NOTE: may need to change for different H and W + splits[0], splits[1] = splits[1], splits[0] + + self.proj = nn.Linear(dim, dim) + self.drop_path = DropPath(droppath) if droppath > 0. 
else Identity() + self.norm2 = nn.LayerNorm(dim) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim * mlp_ratio), + dropout=dropout) + + def chunk_qkv(self, x, chunks=1, axis=-1): + x = x.chunk(chunks, axis=axis) + return x + + def forward(self, x): + H, W = self.input_resolution + B, HW, C = x.shape + # cswin attention + h = x + x = self.norm1(x) + qkv = self.qkv(x).chunk(3, axis=-1) # qkv is a tuple of [q, k, v] + chunks = 2 if self.split_heads else 1 + # qkv[0].shape = [B, H * W, embd_dim] + q, k, v = map(self.chunk_qkv, qkv, (chunks,) * 3) # map requries list/tuple inputs + if self.split_heads: # first 3 stages + h_attn = self.attns[0](q[0], k[0], v[0]) + w_attn = self.attns[1](q[1], k[1], v[1]) + attn = paddle.concat([h_attn, w_attn], axis=2) + else: # last stage + attn = self.attns[0](q[0], k[0], v[0]) + attn = self.proj(attn) + attn = self.drop_path(attn) + x = h + attn + # mlp + residual + h = x + x = self.norm2(x) + x = self.mlp(x) + x = self.drop_path(x) + x = h + x + return x + + +class MergeBlock(nn.Layer): + def __init__(self, dim_in, dim_out): + super().__init__() + self.conv = nn.Conv2D(in_channels=dim_in, + out_channels=dim_out, + kernel_size=3, + stride=2, + padding=1) + self.norm = nn.LayerNorm(dim_out) + + def forward(self, x): + B, HW, C = x.shape + H = W = int(np.sqrt(HW)) + x = x.transpose([0, 2, 1]) # [B, C, HW] + x = x.reshape([B, C, H, W]) # [B, C, H, W] + x = self.conv(x) + new_shape = [x.shape[0], x.shape[1], -1] # [B, C', H*W] + x = x.reshape(new_shape) # [B, C', H*W] + x = x.transpose([0, 2, 1]) # [B, H*W, C'] + x = self.norm(x) + return x + + +class CSwinStage(nn.Layer): + """ CSwin Stage, each stage contains multi blocks + + CSwin has 4 stages, the first 3 stages are using head split. The last + stage does not have head split. There is a merge block between each + 2 stages. + + Args: + dim: int, input feature dimension + depth: int, number of blocks in current stage + num_heads: int, num of attention heads in current stage + split_size: int, the split size in current stage + mlp_ratio: float, mlp ratio, mlp_hidden_dim = mlp_ratio * mlp_in_dim, default: 4. 
+ qkv_bias: bool, if set True, qkv projection will have bias, default: True + qk_scale: float, if set, replace the orig qk_scale (dim_head ** -0.5), default: None + dropout: float, dropout rate for linear projection, default: 0 + attention_dropout: float, dropout rate for attention, default: 0 + droppath: float, drop path rate, default: 0 + last_stage: bool, if current stage is the last stage, default: False + """ + def __init__(self, + dim, + input_resolution, + depth, + num_heads, + split_size, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + dropout=0., + attention_dropout=0., + droppath=0., + last_stage=False): + super().__init__() + self.blocks = nn.LayerList() + for i in range(depth): + block = CSwinBlock(dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + split_size=split_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attention_dropout=attention_dropout, + dropout=dropout, + droppath=droppath[i] if isinstance(droppath, list) else droppath, + split_heads=not last_stage) + self.blocks.append(copy.deepcopy(block)) + # last stage does not need merge layer + self.merge = MergeBlock(dim_in=dim, dim_out=dim * 2) if not last_stage else Identity() + + def forward(self, x): + for block in self.blocks: + x = block(x) + x = self.merge(x) + return x + + +class CSwinTransformer(nn.Layer): + """CSwin Transformer class + Args: + image_size: int, input image size, default: 224 + patch_stride: int, stride for patch embedding, default: 4 + in_channels: int, num of channels of input image, default: 3 + num_classes: int, num of classes, default: 1000 + embed_dim: int, embedding dim (patch embed out dim), default: 96 + depths: list/tuple(int), number of blocks in each stage, default: [2, 4, 32, 2] + splits: list/tuple(int), the split number in each stage, default: [1, 2, 7, 7] + num_heads: list/tuple(int), num of attention heads in each stage, default: [4, 8, 16, 32] + mlp_ratio: float, mlp ratio, mlp_hidden_dim = mlp_ratio * mlp_in_dim, default: 4. 
+ qkv_bias: bool, if set True, qkv projection will have bias, default: True + qk_scale: float, if set, replace the orig qk_scale (dim_head ** -0.5), default: None + dropout: float, dropout rate for linear projection, default: 0 + attention_dropout: float, dropout rate for attention, default: 0 + droppath: float, drop path rate, default: 0 + """ + def __init__(self, + image_size=224, + patch_stride=4, + in_channels=3, + num_classes=1000, + embed_dim=96, + depths=[2, 4, 32, 2], + splits=[1, 2, 7, 7], + num_heads=[4, 8, 16, 32], + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + dropout=0., + attention_dropout=0., + droppath=0.): + super().__init__() + # token embedding + self.patch_embedding = PatchEmbedding(patch_stride=patch_stride, + in_channels=in_channels, + embed_dim=embed_dim) + # drop path decay by stage + depth_decay = [x.item() for x in paddle.linspace(0, droppath, sum(depths))] + dim = embed_dim + resolution = image_size // 4 + self.stages = nn.LayerList() + num_stages = len(depths) + # construct CSwin stages: each stage has multiple blocks + for stage_idx in range(num_stages): + stage = CSwinStage(dim=dim, + input_resolution=resolution, + depth=depths[stage_idx], + num_heads=num_heads[stage_idx], + split_size=splits[stage_idx], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + dropout=dropout, + attention_dropout=attention_dropout, + droppath=depth_decay[ + sum(depths[:stage_idx]):sum(depths[:stage_idx+1])], + last_stage=stage_idx == num_stages-1) + self.stages.append(stage) + if stage_idx != num_stages - 1: + dim = dim * 2 + resolution = resolution // 2 + # last norm and classification head layers + self.norm = nn.LayerNorm(dim) + self.head = nn.Linear(dim, num_classes) + + def forward_features(self, x): + x = self.patch_embedding(x) + for stage in self.stages: + x = stage(x) + x = self.norm(x) + return paddle.mean(x, axis=1) + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +def build_cswin(config): + """build cswin transformer model using config""" + model = CSwinTransformer(image_size=config.DATA.IMAGE_SIZE, + patch_stride=config.MODEL.TRANS.PATCH_SIZE, + in_channels=config.MODEL.TRANS.IN_CHANNELS, + num_classes=config.MODEL.NUM_CLASSES, + embed_dim=config.MODEL.TRANS.EMBED_DIM, + depths=config.MODEL.TRANS.DEPTHS, + splits=config.MODEL.TRANS.SPLIT_SIZES, + num_heads=config.MODEL.TRANS.NUM_HEADS, + mlp_ratio=config.MODEL.TRANS.MLP_RATIO, + qkv_bias=config.MODEL.TRANS.QKV_BIAS, + qk_scale=config.MODEL.TRANS.QK_SCALE, + dropout=config.MODEL.DROPOUT, + attention_dropout=config.MODEL.ATTENTION_DROPOUT, + droppath=config.MODEL.DROP_PATH) + return model diff --git a/image_classification/CSwin/cswin1.png b/image_classification/CSwin/cswin1.png new file mode 100644 index 00000000..b457d753 Binary files /dev/null and b/image_classification/CSwin/cswin1.png differ diff --git a/image_classification/CSwin/cswin2.png b/image_classification/CSwin/cswin2.png new file mode 100644 index 00000000..adf3978a Binary files /dev/null and b/image_classification/CSwin/cswin2.png differ diff --git a/image_classification/CSwin/datasets.py b/image_classification/CSwin/datasets.py new file mode 100644 index 00000000..eeb16f89 --- /dev/null +++ b/image_classification/CSwin/datasets.py @@ -0,0 +1,190 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Dataset related classes and methods for ViT training and validation +Cifar10, Cifar100 and ImageNet2012 are supported +""" + +import os +import math +from PIL import Image +from paddle.io import Dataset, DataLoader, DistributedBatchSampler +from paddle.vision import transforms, datasets, image_load + +class ImageNet2012Dataset(Dataset): + """Build ImageNet2012 dataset + + This class gets train/val imagenet datasets, which loads transfomed data and labels. + + Attributes: + file_folder: path where imagenet images are stored + transform: preprocessing ops to apply on image + img_path_list: list of full path of images in whole dataset + label_list: list of labels of whole dataset + """ + + def __init__(self, file_folder, mode="train", transform=None): + """Init ImageNet2012 Dataset with dataset file path, mode(train/val), and transform""" + super(ImageNet2012Dataset, self).__init__() + assert mode in ["train", "val"] + self.file_folder = file_folder + self.transform = transform + self.img_path_list = [] + self.label_list = [] + + if mode == "train": + self.list_file = os.path.join(self.file_folder, "train_list.txt") + else: + self.list_file = os.path.join(self.file_folder, "val_list.txt") + + with open(self.list_file, 'r') as infile: + for line in infile: + img_path = line.strip().split()[0] + img_label = int(line.strip().split()[1]) + self.img_path_list.append(os.path.join(self.file_folder, img_path)) + self.label_list.append(img_label) + print(f'----- Imagenet2012 image {mode} list len = {len(self.label_list)}') + + def __len__(self): + return len(self.label_list) + + def __getitem__(self, index): + data = Image.open(self.img_path_list[index]).convert('RGB') + data = self.transform(data) + label = self.label_list[index] + + return data, label + + +def get_train_transforms(config): + """ Get training transforms + + For training, a RandomResizedCrop is applied, then normalization is applied with + [0.5, 0.5, 0.5] mean and std. The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + transforms_train = transforms.Compose([ + transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), + scale=(0.05, 1.0)), + transforms.ToTensor(), + #transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_train + + +def get_val_transforms(config): + """ Get training transforms + + For validation, image is first Resize then CenterCrop to image_size. + Then normalization is applied with [0.5, 0.5, 0.5] mean and std. + The input pixel values must be rescaled to [0, 1.] 
+ Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) + transforms_val = transforms.Compose([ + # scale_size must be single int, which will resize the shorter side of image + transforms.Resize(scale_size, 'bicubic'), + transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), + transforms.ToTensor(), + #transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_val + + +def get_dataset(config, mode='train'): + """ Get dataset from config and mode (train/val) + + Returns the related dataset object according to configs and mode(train/val) + + Args: + config: configs contains dataset related settings. see config.py for details + Returns: + dataset: dataset object + """ + + assert mode in ['train', 'val'] + if config.DATA.DATASET == "cifar10": + if mode == 'train': + dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "cifar100": + if mode == 'train': + dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "imagenet2012": + if mode == 'train': + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_train_transforms(config)) + else: + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_val_transforms(config)) + else: + raise NotImplementedError( + "[{config.DATA.DATASET}] Only cifar10, cifar100, imagenet2012 are supported now") + return dataset + + +def get_dataloader(config, dataset, mode='train', multi_process=False): + """Get dataloader with config, dataset, mode as input, allows multiGPU settings. + + Multi-GPU loader is implements as distributedBatchSampler. + + Args: + config: see config.py for details + dataset: paddle.io.dataset object + mode: train/val + multi_process: if True, use DistributedBatchSampler to support multi-processing + Returns: + dataloader: paddle.io.DataLoader object. + """ + + if mode == 'train': + batch_size = config.DATA.BATCH_SIZE + else: + batch_size = config.DATA.BATCH_SIZE_EVAL + + if multi_process is True: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=(mode == 'train')) + dataloader = DataLoader(dataset, + batch_sampler=sampler, + num_workers=config.DATA.NUM_WORKERS) + else: + dataloader = DataLoader(dataset, + batch_size=batch_size, + num_workers=config.DATA.NUM_WORKERS, + shuffle=(mode == 'train')) + return dataloader diff --git a/image_classification/CSwin/droppath.py b/image_classification/CSwin/droppath.py new file mode 100644 index 00000000..72b012d0 --- /dev/null +++ b/image_classification/CSwin/droppath.py @@ -0,0 +1,62 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth +""" + +import numpy as np +import paddle +import paddle.nn as nn + + +class DropPath(nn.Layer): + """DropPath class""" + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def drop_path(self, inputs): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, set if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if self.drop_prob == 0. or not self.training: + return inputs + keep_prob = 1 - self.drop_prob + keep_prob = paddle.to_tensor(keep_prob, dtype='float32') + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide(keep_prob) * random_tensor # divide is to keep same output expectation + return output + + def forward(self, inputs): + return self.drop_path(inputs) + + +#def main(): +# tmp = paddle.to_tensor(np.random.rand(8, 16, 8, 8), dtype='float32') +# dp = DropPath(0.5) +# for i in range(100): +# out = dp(tmp) +# print(out) +# +#if __name__ == "__main__": +# main() diff --git a/image_classification/CSwin/main_multi_gpu.py b/image_classification/CSwin/main_multi_gpu.py new file mode 100644 index 00000000..77c94fd0 --- /dev/null +++ b/image_classification/CSwin/main_multi_gpu.py @@ -0,0 +1,362 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""CSwin Transformer training/validation using multiple GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as dist +from datasets import get_dataloader, get_dataset +from cswin import build_cswin as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('CSwin Transformer') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +arguments = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, arguments) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + # + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: 
{train_acc_meter.avg:.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + dist.all_reduce(loss) + dist.all_reduce(acc1) + dist.all_reduce(acc5) + loss = loss / dist.get_world_size() + acc1 = acc1 / dist.get_world_size() + acc5 = acc5 / dist.get_world_size() + + batch_size = paddle.to_tensor(image.shape[0]) + dist.all_reduce(batch_size) + + val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) + val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) + val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0]) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main_worker(*args): + # 0. Preparation + dist.init_parallel_env() + last_epoch = config.TRAIN.LAST_EPOCH + world_size = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + seed = config.SEED + local_rank + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # 1. Create model + model = build_model(config) + model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train, dataset_val = args[0], args[1] + dataloader_train = get_dataloader(config, dataset_train, 'train', True) + dataloader_val = get_dataloader(config, dataset_val, 'test', True) + total_batch_train = len(dataloader_train) + total_batch_val = len(dataloader_val) + logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. 
Define optimizer and lr_scheduler
+    scheduler = None
+    if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine":
+        scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR,
+                                          warmup_start_lr=config.TRAIN.WARMUP_START_LR,
+                                          start_lr=config.TRAIN.BASE_LR,
+                                          end_lr=config.TRAIN.END_LR,
+                                          warmup_epochs=config.TRAIN.WARMUP_EPOCHS,
+                                          total_epochs=config.TRAIN.NUM_EPOCHS,
+                                          last_epoch=config.TRAIN.LAST_EPOCH,
+                                          )
+    elif config.TRAIN.LR_SCHEDULER.NAME == "cosine":
+        scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR,
+                                                             T_max=config.TRAIN.NUM_EPOCHS,
+                                                             last_epoch=last_epoch)
+    elif config.TRAIN.LR_SCHEDULER.NAME == "multi-step":
+        milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")]
+        scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR,
+                                                       milestones=milestones,
+                                                       gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE,
+                                                       last_epoch=last_epoch)
+    else:
+        logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.")
+        raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.")
+
+    if config.TRAIN.OPTIMIZER.NAME == "SGD":
+        if config.TRAIN.GRAD_CLIP:
+            clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP)
+        else:
+            clip = None
+        optimizer = paddle.optimizer.Momentum(
+            parameters=model.parameters(),
+            learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR,
+            weight_decay=config.TRAIN.WEIGHT_DECAY,
+            momentum=config.TRAIN.OPTIMIZER.MOMENTUM,
+            grad_clip=clip)
+    elif config.TRAIN.OPTIMIZER.NAME == "AdamW":
+        if config.TRAIN.GRAD_CLIP:
+            clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP)
+        else:
+            clip = None
+        optimizer = paddle.optimizer.AdamW(
+            parameters=model.parameters(),
+            learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR,
+            beta1=config.TRAIN.OPTIMIZER.BETAS[0],
+            beta2=config.TRAIN.OPTIMIZER.BETAS[1],
+            weight_decay=config.TRAIN.WEIGHT_DECAY,
+            epsilon=config.TRAIN.OPTIMIZER.EPS,
+            grad_clip=clip,
+            #apply_decay_param_fun=get_exclude_from_weight_decay_fn(['pos_embed', 'cls_token']),
+            )
+    else:
+        logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.")
+        raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.")
+
+    # 5. Load pretrained model / load resume model and optimizer states
+    if config.MODEL.PRETRAINED:
+        if (config.MODEL.PRETRAINED).endswith('.pdparams'):
+            raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams')
+        assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True
+        model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams')
+        model.set_dict(model_state)
+        logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}")
+
+    if config.MODEL.RESUME:
+        assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True
+        assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True
+        model_state = paddle.load(config.MODEL.RESUME+'.pdparams')
+        model.set_dict(model_state)
+        opt_state = paddle.load(config.MODEL.RESUME+'.pdopt')
+        optimizer.set_state_dict(opt_state)
+        logger.info(
+            f"----- Resume Training: Load model and optimizer states from {config.MODEL.RESUME}")
+
+    # 6. 
Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + + # 6. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER) + scheduler.step() + + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if local_rank == 0: + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +def main(): + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS + dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + + +if __name__ == "__main__": + main() diff --git a/image_classification/CSwin/main_single_gpu.py b/image_classification/CSwin/main_single_gpu.py new file mode 100644 index 00000000..2662c0e5 --- /dev/null +++ b/image_classification/CSwin/main_single_gpu.py @@ -0,0 +1,333 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
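Aside on the multi-GPU script above: `main()` launches one worker per GPU with `dist.spawn`, and `validate()` averages per-rank metrics by summing with `all_reduce` and dividing by the world size. Below is a minimal, self-contained sketch of just that pattern; `_demo_worker` and its constants are illustrative only, and it assumes at least two visible GPUs.

```python
import paddle
import paddle.distributed as dist

def _demo_worker():
    # each spawned process initializes the parallel environment first
    dist.init_parallel_env()
    rank = dist.get_rank()
    # pretend each rank measured a different local accuracy
    local_acc = paddle.to_tensor([0.70 + 0.01 * rank])
    # sum across ranks (in-place), then divide by world size -> same mean on every rank
    dist.all_reduce(local_acc)
    global_acc = local_acc / dist.get_world_size()
    print(f"rank {rank}: global avg acc = {global_acc.numpy()[0]:.4f}")

if __name__ == "__main__":
    # mirrors dist.spawn(main_worker, args=(...), nprocs=config.NGPUS) above
    dist.spawn(_demo_worker, nprocs=2)
```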
+ +"""CSwin Transformer training/validation using single GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from datasets import get_dataloader +from datasets import get_dataset +from cswin import build_cswin as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('CSwin Transformer') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +args = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, args) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +#config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: 
{train_acc_meter.avg:.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + batch_size = image.shape[0] + val_loss_meter.update(loss.numpy()[0], batch_size) + val_acc1_meter.update(acc1.numpy()[0], batch_size) + val_acc5_meter.update(acc5.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main(): + # 0. Preparation + last_epoch = config.TRAIN.LAST_EPOCH + seed = config.SEED + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + #paddle.set_device('gpu:0') + # 1. Create model + model = build_model(config) + #model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + dataloader_train = get_dataloader(config, dataset_train, 'train', False) + dataloader_val = get_dataloader(config, dataset_val, 'val', False) + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. Define lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + # 5. 
Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + # 6. Load pretrained model or load resume model and optimizer states + if config.MODEL.PRETRAINED: + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_dict(opt_state) + logger.info( + f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") + # 7. Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + # 8. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. 
LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=len(dataloader_train), + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + ) + scheduler.step() + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +if __name__ == "__main__": + main() diff --git a/image_classification/CSwin/port_weights/load_pytorch_weights_base_224.py b/image_classification/CSwin/port_weights/load_pytorch_weights_base_224.py new file mode 100644 index 00000000..512f2fb1 --- /dev/null +++ b/image_classification/CSwin/port_weights/load_pytorch_weights_base_224.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
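A side note on the two training scripts above: checkpoints are always addressed without their extension, and the scripts append `.pdparams` / `.pdopt` themselves (the multi-GPU version even raises if `-pretrained` already ends in `.pdparams`). The sketch below condenses that convention; `save_checkpoint` and `load_checkpoint` are hypothetical helper names, not repo code.

```python
import os
import paddle

def save_checkpoint(model, optimizer, save_dir, model_type, epoch, loss):
    # e.g. './output/cswin-Epoch-10-Loss-2.31' -- this prefix is what -resume expects
    prefix = os.path.join(save_dir, f"{model_type}-Epoch-{epoch}-Loss-{loss}")
    paddle.save(model.state_dict(), prefix + '.pdparams')
    paddle.save(optimizer.state_dict(), prefix + '.pdopt')
    return prefix

def load_checkpoint(model, optimizer, prefix):
    # prefix must NOT include the '.pdparams'/'.pdopt' suffix
    model.set_dict(paddle.load(prefix + '.pdparams'))
    optimizer.set_state_dict(paddle.load(prefix + '.pdopt'))
```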
+ +import argparse +import numpy as np +import paddle +import torch +from cswin import * +from cswin_pytorch.CSWin_Transformer.models import cswin as pytorch_cswin +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/cswin_base_224.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + print('-----------------MODEL NAMED PARAMETERS------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + + +def print_model_named_buffers(model): + print('-----------------MODEL NAMED BUFFERS------------------------') + for name, buff in model.named_buffers(): + print(name, buff.shape) + + +def torch_to_paddle_mapping(): + mapping = [ + ('stage1_conv_embed.0', 'patch_embedding.patch_embed'), + ('stage1_conv_embed.2', 'patch_embedding.norm'), + ] + + for stage_idx, stage_depth in enumerate(config.MODEL.TRANS.DEPTHS): + for idx in range(stage_depth): + th_prefix = f'stage{stage_idx+1}.{idx}' + pp_prefix = f'stages.{stage_idx}.blocks.{idx}' + + layer_mapping = [ + (f'{th_prefix}.qkv', f'{pp_prefix}.qkv'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.proj', f'{pp_prefix}.proj'), + (f'{th_prefix}.attns.0.get_v', f'{pp_prefix}.attns.0.get_v'), + (f'{th_prefix}.attns.1.get_v', f'{pp_prefix}.attns.1.get_v'), # may not exist, ok + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + # prefix for last stage may not exist, it is ok in this code + th_prefix = f'merge{stage_idx+1}' + pp_prefix = f'stages.{stage_idx}.merge' + layer_mapping = [ + (f'{th_prefix}.conv', f'{pp_prefix}.conv'), + (f'{th_prefix}.norm', f'{pp_prefix}.norm'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + if f'{th_name}.weight' in th_params.keys(): + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_cswin(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('----------------------------------') + + device = torch.device('cpu') + torch_model = pytorch_cswin.CSWin_96_24322_base_224() + model_state_dict = torch.load('./cswin_pytorch/cswin_base_224.pth') + torch_model.load_state_dict(model_state_dict['state_dict_ema']) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./cswin_base_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/CSwin/port_weights/load_pytorch_weights_base_384.py b/image_classification/CSwin/port_weights/load_pytorch_weights_base_384.py new file mode 100644 index 00000000..93b113ad --- /dev/null +++ b/image_classification/CSwin/port_weights/load_pytorch_weights_base_384.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
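The port script above transposes every 2-D weight inside `_set_value`. The reason is a layout difference: `torch.nn.Linear` stores its weight as `(out_features, in_features)` while `paddle.nn.Linear` stores `(in_features, out_features)`. A standalone sanity check of that difference (not part of the repo):

```python
import numpy as np
import paddle
import torch

torch_fc = torch.nn.Linear(4, 3)    # weight shape (3, 4)
paddle_fc = paddle.nn.Linear(4, 3)  # weight shape (4, 3)

# copy torch weights into paddle, transposing the 2-D weight exactly as _set_value does
paddle_fc.weight.set_value(torch_fc.weight.detach().numpy().transpose((1, 0)))
paddle_fc.bias.set_value(torch_fc.bias.detach().numpy())

x = np.random.randn(2, 4).astype('float32')
out_torch = torch_fc(torch.tensor(x)).detach().numpy()
out_paddle = paddle_fc(paddle.to_tensor(x)).numpy()
assert np.allclose(out_torch, out_paddle, atol=1e-5)
```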
+ +import argparse +import numpy as np +import paddle +import torch +from cswin import * +from cswin_pytorch.CSWin_Transformer.models import cswin as pytorch_cswin +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/cswin_base_384.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + print('-----------------MODEL NAMED PARAMETERS------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + + +def print_model_named_buffers(model): + print('-----------------MODEL NAMED BUFFERS------------------------') + for name, buff in model.named_buffers(): + print(name, buff.shape) + + +def torch_to_paddle_mapping(): + mapping = [ + ('stage1_conv_embed.0', 'patch_embedding.patch_embed'), + ('stage1_conv_embed.2', 'patch_embedding.norm'), + ] + + for stage_idx, stage_depth in enumerate(config.MODEL.TRANS.DEPTHS): + for idx in range(stage_depth): + th_prefix = f'stage{stage_idx+1}.{idx}' + pp_prefix = f'stages.{stage_idx}.blocks.{idx}' + + layer_mapping = [ + (f'{th_prefix}.qkv', f'{pp_prefix}.qkv'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.proj', f'{pp_prefix}.proj'), + (f'{th_prefix}.attns.0.get_v', f'{pp_prefix}.attns.0.get_v'), + (f'{th_prefix}.attns.1.get_v', f'{pp_prefix}.attns.1.get_v'), # may not exist, ok + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + # prefix for last stage may not exist, it is ok in this code + th_prefix = f'merge{stage_idx+1}' + pp_prefix = f'stages.{stage_idx}.merge' + layer_mapping = [ + (f'{th_prefix}.conv', f'{pp_prefix}.conv'), + (f'{th_prefix}.norm', f'{pp_prefix}.norm'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + if f'{th_name}.weight' in th_params.keys(): + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_cswin(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('----------------------------------') + + device = torch.device('cpu') + torch_model = pytorch_cswin.CSWin_96_24322_base_384(img_size=384) + model_state_dict = torch.load('./cswin_pytorch/cswin_base_384.pth') + torch_model.load_state_dict(model_state_dict['state_dict_ema']) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 384, 384).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./cswin_base_384.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/CSwin/port_weights/load_pytorch_weights_large_224.py b/image_classification/CSwin/port_weights/load_pytorch_weights_large_224.py new file mode 100644 index 00000000..998c1eb8 --- /dev/null +++ b/image_classification/CSwin/port_weights/load_pytorch_weights_large_224.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
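One more note on the porting scripts: `torch_to_paddle_mapping()` lists layer prefixes, and `convert()` only expands a prefix into `.weight` / `.bias` entries when those names exist in the torch state dict, which is why prefixes such as `attns.1.*` or the merge layer of the last stage can be listed even when they are absent. The snippet below re-implements just that expansion step for illustration; `expand_mapping` and the parameter names are made up.

```python
def expand_mapping(mapping, torch_param_names):
    """Expand (torch_prefix, paddle_prefix) pairs the same way convert() does."""
    pairs = []
    for th_name, pd_name in mapping:
        if th_name in torch_param_names:            # a bare nn.Parameter
            pairs.append((th_name, pd_name))
            continue
        for suffix in ('.weight', '.bias'):         # a layer: copy weight/bias if present
            if th_name + suffix in torch_param_names:
                pairs.append((th_name + suffix, pd_name + suffix))
    return pairs

names = {'stage1.0.qkv.weight', 'stage1.0.qkv.bias', 'merge1.conv.weight'}
print(expand_mapping([('stage1.0.qkv', 'stages.0.blocks.0.qkv'),
                      ('merge1.conv', 'stages.0.merge.conv'),
                      ('merge4.conv', 'stages.3.merge.conv')],  # missing -> silently skipped
                     names))
```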
+ +import argparse +import numpy as np +import paddle +import torch +from cswin import * +from cswin_pytorch.CSWin_Transformer.models import cswin as pytorch_cswin +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/cswin_large_224.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + print('-----------------MODEL NAMED PARAMETERS------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + + +def print_model_named_buffers(model): + print('-----------------MODEL NAMED BUFFERS------------------------') + for name, buff in model.named_buffers(): + print(name, buff.shape) + + +def torch_to_paddle_mapping(): + mapping = [ + ('stage1_conv_embed.0', 'patch_embedding.patch_embed'), + ('stage1_conv_embed.2', 'patch_embedding.norm'), + ] + + for stage_idx, stage_depth in enumerate(config.MODEL.TRANS.DEPTHS): + for idx in range(stage_depth): + th_prefix = f'stage{stage_idx+1}.{idx}' + pp_prefix = f'stages.{stage_idx}.blocks.{idx}' + + layer_mapping = [ + (f'{th_prefix}.qkv', f'{pp_prefix}.qkv'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.proj', f'{pp_prefix}.proj'), + (f'{th_prefix}.attns.0.get_v', f'{pp_prefix}.attns.0.get_v'), + (f'{th_prefix}.attns.1.get_v', f'{pp_prefix}.attns.1.get_v'), # may not exist, ok + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + # prefix for last stage may not exist, it is ok in this code + th_prefix = f'merge{stage_idx+1}' + pp_prefix = f'stages.{stage_idx}.merge' + layer_mapping = [ + (f'{th_prefix}.conv', f'{pp_prefix}.conv'), + (f'{th_prefix}.norm', f'{pp_prefix}.norm'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + if f'{th_name}.weight' in th_params.keys(): + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_cswin(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('----------------------------------') + + device = torch.device('cpu') + torch_model = pytorch_cswin.CSWin_144_24322_large_224() + model_state_dict = torch.load('./cswin_pytorch/cswin_large_224.pth') + torch_model.load_state_dict(model_state_dict['state_dict_ema']) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./cswin_large_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/CSwin/port_weights/load_pytorch_weights_large_384.py b/image_classification/CSwin/port_weights/load_pytorch_weights_large_384.py new file mode 100644 index 00000000..39d1b6fe --- /dev/null +++ b/image_classification/CSwin/port_weights/load_pytorch_weights_large_384.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +from cswin import * +from cswin_pytorch.CSWin_Transformer.models import cswin as pytorch_cswin +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/cswin_large_384.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + print('-----------------MODEL NAMED PARAMETERS------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + + +def print_model_named_buffers(model): + print('-----------------MODEL NAMED BUFFERS------------------------') + for name, buff in model.named_buffers(): + print(name, buff.shape) + + +def torch_to_paddle_mapping(): + mapping = [ + ('stage1_conv_embed.0', 'patch_embedding.patch_embed'), + ('stage1_conv_embed.2', 'patch_embedding.norm'), + ] + + for stage_idx, stage_depth in enumerate(config.MODEL.TRANS.DEPTHS): + for idx in range(stage_depth): + th_prefix = f'stage{stage_idx+1}.{idx}' + pp_prefix = f'stages.{stage_idx}.blocks.{idx}' + + layer_mapping = [ + (f'{th_prefix}.qkv', f'{pp_prefix}.qkv'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.proj', f'{pp_prefix}.proj'), + (f'{th_prefix}.attns.0.get_v', f'{pp_prefix}.attns.0.get_v'), + (f'{th_prefix}.attns.1.get_v', f'{pp_prefix}.attns.1.get_v'), # may not exist, ok + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + # prefix for last stage may not exist, it is ok in this code + th_prefix = f'merge{stage_idx+1}' + pp_prefix = f'stages.{stage_idx}.merge' + layer_mapping = [ + (f'{th_prefix}.conv', f'{pp_prefix}.conv'), + (f'{th_prefix}.norm', f'{pp_prefix}.norm'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + if f'{th_name}.weight' in th_params.keys(): + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_cswin(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('----------------------------------') + + device = torch.device('cpu') + torch_model = pytorch_cswin.CSWin_144_24322_large_384(img_size=384) + model_state_dict = torch.load('./cswin_pytorch/cswin_large_384.pth') + torch_model.load_state_dict(model_state_dict['state_dict_ema']) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 384, 384).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./cswin_large_384.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/CSwin/port_weights/load_pytorch_weights_small_224.py b/image_classification/CSwin/port_weights/load_pytorch_weights_small_224.py new file mode 100644 index 00000000..a08b5bf4 --- /dev/null +++ b/image_classification/CSwin/port_weights/load_pytorch_weights_small_224.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +from cswin import * +from cswin_pytorch.CSWin_Transformer.models import cswin as pytorch_cswin +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/cswin_small_224.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + print('-----------------MODEL NAMED PARAMETERS------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + + +def print_model_named_buffers(model): + print('-----------------MODEL NAMED BUFFERS------------------------') + for name, buff in model.named_buffers(): + print(name, buff.shape) + + +def torch_to_paddle_mapping(): + mapping = [ + ('stage1_conv_embed.0', 'patch_embedding.patch_embed'), + ('stage1_conv_embed.2', 'patch_embedding.norm'), + ] + + for stage_idx, stage_depth in enumerate(config.MODEL.TRANS.DEPTHS): + for idx in range(stage_depth): + th_prefix = f'stage{stage_idx+1}.{idx}' + pp_prefix = f'stages.{stage_idx}.blocks.{idx}' + + layer_mapping = [ + (f'{th_prefix}.qkv', f'{pp_prefix}.qkv'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.proj', f'{pp_prefix}.proj'), + (f'{th_prefix}.attns.0.get_v', f'{pp_prefix}.attns.0.get_v'), + (f'{th_prefix}.attns.1.get_v', f'{pp_prefix}.attns.1.get_v'), # may not exist, ok + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + # prefix for last stage may not exist, it is ok in this code + th_prefix = f'merge{stage_idx+1}' + pp_prefix = f'stages.{stage_idx}.merge' + layer_mapping = [ + (f'{th_prefix}.conv', f'{pp_prefix}.conv'), + (f'{th_prefix}.norm', f'{pp_prefix}.norm'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + if f'{th_name}.weight' in th_params.keys(): + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_cswin(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('----------------------------------') + + device = torch.device('cpu') + torch_model = pytorch_cswin.CSWin_64_24322_small_224() + model_state_dict = torch.load('./cswin_pytorch/cswin_small_224.pth') + torch_model.load_state_dict(model_state_dict['state_dict_ema']) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./cswin_small_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/CSwin/port_weights/load_pytorch_weights_tiny_224.py b/image_classification/CSwin/port_weights/load_pytorch_weights_tiny_224.py new file mode 100644 index 00000000..d304572a --- /dev/null +++ b/image_classification/CSwin/port_weights/load_pytorch_weights_tiny_224.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +from cswin import * +from cswin_pytorch.CSWin_Transformer.models import cswin as pytorch_cswin +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/cswin_tiny_224.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + print('-----------------MODEL NAMED PARAMETERS------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + + +def print_model_named_buffers(model): + print('-----------------MODEL NAMED BUFFERS------------------------') + for name, buff in model.named_buffers(): + print(name, buff.shape) + + +def torch_to_paddle_mapping(): + mapping = [ + ('stage1_conv_embed.0', 'patch_embedding.patch_embed'), + ('stage1_conv_embed.2', 'patch_embedding.norm'), + ] + + for stage_idx, stage_depth in enumerate(config.MODEL.TRANS.DEPTHS): + for idx in range(stage_depth): + th_prefix = f'stage{stage_idx+1}.{idx}' + pp_prefix = f'stages.{stage_idx}.blocks.{idx}' + + layer_mapping = [ + (f'{th_prefix}.qkv', f'{pp_prefix}.qkv'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.proj', f'{pp_prefix}.proj'), + (f'{th_prefix}.attns.0.get_v', f'{pp_prefix}.attns.0.get_v'), + (f'{th_prefix}.attns.1.get_v', f'{pp_prefix}.attns.1.get_v'), # may not exist, ok + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + # prefix for last stage may not exist, it is ok in this code + th_prefix = f'merge{stage_idx+1}' + pp_prefix = f'stages.{stage_idx}.merge' + layer_mapping = [ + (f'{th_prefix}.conv', f'{pp_prefix}.conv'), + (f'{th_prefix}.norm', f'{pp_prefix}.norm'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + if f'{th_name}.weight' in th_params.keys(): + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_cswin(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('----------------------------------') + + device = torch.device('cpu') + torch_model = pytorch_cswin.CSWin_64_12211_tiny_224() + model_state_dict = torch.load('./cswin_pytorch/cswin_tiny_224.pth') + torch_model.load_state_dict(model_state_dict['state_dict_ema']) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./cswin_tiny_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/CSwin/run_eval_multi_base_224.sh b/image_classification/CSwin/run_eval_multi_base_224.sh new file mode 100644 index 00000000..932203de --- /dev/null +++ b/image_classification/CSwin/run_eval_multi_base_224.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/cswin_base_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=64 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./cswin_base_224' \ diff --git a/image_classification/CSwin/run_eval_multi_base_384.sh b/image_classification/CSwin/run_eval_multi_base_384.sh new file mode 100644 index 00000000..b97077af --- /dev/null +++ b/image_classification/CSwin/run_eval_multi_base_384.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/cswin_base_384.yaml' \ +-dataset='imagenet2012' \ +-batch_size=64 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./cswin_base_384' \ diff --git a/image_classification/CSwin/run_eval_multi_large_224.sh b/image_classification/CSwin/run_eval_multi_large_224.sh new file mode 100644 index 00000000..3f9ba07e --- /dev/null +++ b/image_classification/CSwin/run_eval_multi_large_224.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/cswin_large_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=64 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./cswin_large_224' \ diff --git a/image_classification/CSwin/run_eval_multi_large_384.sh b/image_classification/CSwin/run_eval_multi_large_384.sh new file mode 100644 index 00000000..844ac1a9 --- /dev/null +++ 
b/image_classification/CSwin/run_eval_multi_large_384.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/cswin_large_384.yaml' \ +-dataset='imagenet2012' \ +-batch_size=64 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./cswin_large_384' \ diff --git a/image_classification/CSwin/run_eval_multi_tiny.sh b/image_classification/CSwin/run_eval_multi_tiny.sh new file mode 100644 index 00000000..b9c3c8f7 --- /dev/null +++ b/image_classification/CSwin/run_eval_multi_tiny.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/cswin_tiny_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=64 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./cswin_tiny_224' \ diff --git a/image_classification/CSwin/run_eval_small.sh b/image_classification/CSwin/run_eval_small.sh new file mode 100644 index 00000000..dd88a9d8 --- /dev/null +++ b/image_classification/CSwin/run_eval_small.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=7 \ +python main_single_gpu.py \ +-cfg='./configs/cswin_small_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=128 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./cswin_small_224' diff --git a/image_classification/CSwin/run_eval_tiny.sh b/image_classification/CSwin/run_eval_tiny.sh new file mode 100644 index 00000000..d27c5033 --- /dev/null +++ b/image_classification/CSwin/run_eval_tiny.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=7 \ +python main_single_gpu.py \ +-cfg='./configs/cswin_tiny_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=64 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./cswin_tiny_224' diff --git a/image_classification/CSwin/run_train_tiny.sh b/image_classification/CSwin/run_train_tiny.sh new file mode 100644 index 00000000..0203ffda --- /dev/null +++ b/image_classification/CSwin/run_train_tiny.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=7 \ +python main_single_gpu.py \ +-cfg='./configs/cswin_tiny_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=64 \ +-data_path='/dataset/imagenet' \ diff --git a/image_classification/CSwin/run_train_tiny_multi.sh b/image_classification/CSwin/run_train_tiny_multi.sh new file mode 100644 index 00000000..c86fc37a --- /dev/null +++ b/image_classification/CSwin/run_train_tiny_multi.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/cswin_tiny_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=64 \ +-data_path='/dataset/imagenet' \ diff --git a/image_classification/CSwin/utils.py b/image_classification/CSwin/utils.py new file mode 100644 index 00000000..44800527 --- /dev/null +++ b/image_classification/CSwin/utils.py @@ -0,0 +1,120 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""utils for ViT + +Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training +and WarmupCosineScheduler for training + +""" + +import math +from paddle.optimizer.lr import LRScheduler + + +class AverageMeter(): + """ Meter for monitoring losses""" + def __init__(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.reset() + + def reset(self): + """reset all values to zeros""" + self.avg = 0 + self.sum = 0 + self.cnt = 0 + + def update(self, val, n=1): + """update avg by val and n, where val is the avg of n values""" + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + + +def get_exclude_from_weight_decay_fn(exclude_list=[]): + """ Set params with no weight decay during the training + + For certain params, e.g., positional encoding in ViT, weight decay + may not needed during the learning, this method is used to find + these params. + + Args: + exclude_list: a list of params names which need to exclude + from weight decay. + Returns: + exclude_from_weight_decay_fn: a function returns True if param + will be excluded from weight decay + """ + if len(exclude_list) == 0: + exclude_from_weight_decay_fn = None + else: + def exclude_fn(param): + for name in exclude_list: + if param.endswith(name): + return False + return True + exclude_from_weight_decay_fn = exclude_fn + return exclude_from_weight_decay_fn + + +class WarmupCosineScheduler(LRScheduler): + """Warmup Cosine Scheduler + + First apply linear warmup, then apply cosine decay schedule. + Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" + Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining + "total_epochs - warmup_epochs" + + Attributes: + learning_rate: the starting learning rate (without warmup), not used here! + warmup_start_lr: warmup starting learning rate + start_lr: the starting learning rate (without warmup) + end_lr: the ending learning rate after whole loop + warmup_epochs: # of epochs for warmup + total_epochs: # of total epochs (include warmup) + """ + def __init__(self, + learning_rate, + warmup_start_lr, + start_lr, + end_lr, + warmup_epochs, + total_epochs, + cycles=0.5, + last_epoch=-1, + verbose=False): + """init WarmupCosineScheduler """ + self.warmup_epochs = warmup_epochs + self.total_epochs = total_epochs + self.warmup_start_lr = warmup_start_lr + self.start_lr = start_lr + self.end_lr = end_lr + self.cycles = cycles + super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + """ return lr value """ + if self.last_epoch < self.warmup_epochs: + val = (self.start_lr - self.warmup_start_lr) * float( + self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr + return val + + progress = float(self.last_epoch - self.warmup_epochs) / float( + max(1, self.total_epochs - self.warmup_epochs)) + val = max(0.0, 0.5 * (1. 
+ math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) + return val diff --git a/image_classification/CaiT/.config.py.swp b/image_classification/CaiT/.config.py.swp new file mode 100644 index 00000000..ed536a52 Binary files /dev/null and b/image_classification/CaiT/.config.py.swp differ diff --git a/image_classification/CaiT/README.md b/image_classification/CaiT/README.md new file mode 100644 index 00000000..b9ed741d --- /dev/null +++ b/image_classification/CaiT/README.md @@ -0,0 +1,164 @@ +# Going deeper with Image Transformers, [arxiv](https://arxiv.org/abs/2103.17239) + +PaddlePaddle training/validation code and pretrained models for **CaiT**. + +The official pytorch implementation is [here](https://github.com/facebookresearch/deit). + +This implementation is developed by [PaddleViT](https://github.com/BR-IDL/PaddleViT.git). + + +

+<p align="center">
+<img src="./cait.png" alt="drawing"/>
+<h4 align="center">CaiT Model Overview</h4>
+</p>

+ + +### Update +Update (2021-08-11): Code is released and ported weights are uploaded. + +## Models Zoo +| Model | Acc@1 | Acc@5 | Image Size | Crop_pct | Interpolation | Link | +|--------------------------------|-------|-------|------------|----------|---------------|--------------| +| cait_xxs24_224 | 78.38 | 94.32 | 224 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1LKsQUr824oY4E42QeUEaFt41I8xHNseR/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1YIaBLopKIK5_p7NlgWHpGA)(j9m8) | +| cait_s24_384 | 85.05 | 97.34 | 384 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1GU0esukDvMg3u40FZB_5GiB6qpShjvGh/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1qvhNckJjcEf5HyVn8LuEeA)(qb86) | +| cait_m48_448 | 86.49 | 97.75 | 448 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1lJSP__dVERBNFnp7im-1xM3s_lqEe82-/view?usp=sharing)/[baidu](https://pan.baidu.com/s/179MA3MkG2qxFle0K944Gkg)(imk5) | + +> *The results are evaluated on ImageNet2012 validation set. + +## Notebooks +We provide a few notebooks in aistudio to help you get started: + +**\*(coming soon)\*** + + +## Requirements +- Python>=3.6 +- yaml>=0.2.5 +- [PaddlePaddle](https://www.paddlepaddle.org.cn/documentation/docs/en/install/index_en.html)>=2.1.0 +- [yacs](https://github.com/rbgirshick/yacs)>=0.1.8 + +## Data +ImageNet2012 dataset is used in the following folder structure: +``` +│imagenet/ +├──train/ +│ ├── n01440764 +│ │ ├── n01440764_10026.JPEG +│ │ ├── n01440764_10027.JPEG +│ │ ├── ...... +│ ├── ...... +├──val/ +│ ├── n01440764 +│ │ ├── ILSVRC2012_val_00000293.JPEG +│ │ ├── ILSVRC2012_val_00002138.JPEG +│ │ ├── ...... +│ ├── ...... +``` + +## Usage +To use the model with pretrained weights, download the `.pdparam` weight file and change related file paths in the following python scripts. The model config files are located in `./configs/`. + +For example, assume the downloaded weight file is stored in `./cait_xxs24_224.pdparams`, to use the `cait_xxs24_224` model in python: +```python +from config import get_config +from cait import build_cait as build_model +# config files in ./configs/ +config = get_config('./configs/cait_xxs24_224.yaml') +# build model +model = build_model(config) +# load pretrained weights, .pdparams is NOT needed +model_state_dict = paddle.load('./cait_xxs24_224') +model.set_dict(model_state_dict) +``` + +## Evaluation +To evaluate CaiT model performance on ImageNet2012 with a single GPU, run the following script using command line: +```shell +sh run_eval.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/cait_xxs24_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./cait_xxs24_224' +``` + +
+ + +Run evaluation using multi-GPUs: + + + +```shell +sh run_eval_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/cait_xxs24_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./cait_xxs24_224' +``` + +
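The evaluation commands above score the full ImageNet2012 validation split. For a quick sanity check of a downloaded `.pdparams` file on a single image, a minimal sketch along the following lines can be used. It simply mirrors the validation preprocessing in `datasets.py` (resize by `IMAGE_SIZE / CROP_PCT` with bicubic interpolation, center crop, ImageNet mean/std normalization); the sample image path `demo.jpg` and the top-5 printout are illustrative only and not part of this repository.

```python
import math
import paddle
import paddle.nn.functional as F
from PIL import Image
from paddle.vision import transforms

from config import get_config
from cait import build_cait

# build the model from the yaml config and load the ported weights
config = get_config('./configs/cait_xxs24_224.yaml')
model = build_cait(config)
model.set_dict(paddle.load('./cait_xxs24_224.pdparams'))
model.eval()

# same preprocessing as get_val_transforms() in datasets.py
scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT))
val_transforms = transforms.Compose([
    transforms.Resize(scale_size, 'bicubic'),
    transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

img = Image.open('demo.jpg').convert('RGB')        # hypothetical sample image
x = paddle.unsqueeze(val_transforms(img), axis=0)  # [1, 3, H, W]

with paddle.no_grad():
    probs = F.softmax(model(x), axis=-1)           # [1, num_classes]
top5_prob, top5_idx = paddle.topk(probs, k=5, axis=-1)
print(top5_idx.numpy(), top5_prob.numpy())
```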
+ +## Training +To train the CaiT Transformer model on ImageNet2012 with single GPU, run the following script using command line: +```shell +sh run_train.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/cait_xxs24_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=32 \ + -data_path='/dataset/imagenet' \ +``` + +
+ + +Run training using multi-GPUs: + + + +```shell +sh run_train_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/cait_xxs24_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ +``` + +
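Both training entry points drive the learning rate with the `WarmupCosineScheduler` defined in `utils.py`: the LR ramps linearly from `WARMUP_START_LR` to `BASE_LR` over `WARMUP_EPOCHS`, then decays cosinely towards `END_LR`. The short sketch below, using the default values from `config.py` purely for illustration, prints the per-epoch LR the same way the training loops consume it, i.e. with one `scheduler.step()` per epoch.

```python
from utils import WarmupCosineScheduler

# values mirror the defaults in config.py (TRAIN.BASE_LR, WARMUP_START_LR, END_LR, ...)
scheduler = WarmupCosineScheduler(learning_rate=0.001,
                                  warmup_start_lr=1e-6,
                                  start_lr=0.001,
                                  end_lr=5e-4,
                                  warmup_epochs=3,
                                  total_epochs=300)

for epoch in range(1, 11):
    print(f'epoch {epoch:03d}: lr = {scheduler.get_lr():.6f}')
    scheduler.step()   # called once per epoch in main_single_gpu.py / main_multi_gpu.py
```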
+ + +## Visualization Attention Map +**(coming soon)** + +## Reference +``` +@article{zhang2021gmlp, + title={GMLP: Building Scalable and Flexible Graph Neural Networks with Feature-Message Passing}, + author={Zhang, Wentao and Shen, Yu and Lin, Zheyu and Li, Yang and Li, Xiaosen and Ouyang, Wen and Tao, Yangyu and Yang, Zhi and Cui, Bin}, + journal={arXiv preprint arXiv:2104.09880}, + year={2021} +} +``` diff --git a/image_classification/CaiT/cait.png b/image_classification/CaiT/cait.png new file mode 100644 index 00000000..3d3be6e7 Binary files /dev/null and b/image_classification/CaiT/cait.png differ diff --git a/image_classification/CaiT/cait.py b/image_classification/CaiT/cait.py new file mode 100644 index 00000000..d8038106 --- /dev/null +++ b/image_classification/CaiT/cait.py @@ -0,0 +1,507 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Implement CaiT Transformer +""" + +import copy +import paddle +import paddle.nn as nn +from droppath import DropPath + + +class Identity(nn.Layer): + """ Identity layer + + The output of this layer is the input without any change. + Use this layer to avoid using 'if' condition in forward methods + """ + + def __init__(self): + super(Identity, self).__init__() + def forward(self, x): + return x + + +class PatchEmbedding(nn.Layer): + """Patch Embeddings + + Apply patch embeddings on input images. Embeddings is implemented using a Conv2D op. + + Attributes: + image_size: int, input image size, default: 224 + patch_size: int, size of patch, default: 4 + in_channels: int, input image channels, default: 3 + embed_dim: int, embedding dimension, default: 96 + """ + + def __init__(self, image_size=224, patch_size=4, in_channels=3, embed_dim=96): + super().__init__() + image_size = (image_size, image_size) + patch_size = (patch_size, patch_size) + patches_resolution = [image_size[0]//patch_size[0], image_size[1]//patch_size[1]] + self.image_size = image_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + self.in_channels = in_channels + self.embed_dim = embed_dim + self.patch_embed = nn.Conv2D(in_channels=in_channels, + out_channels=embed_dim, + kernel_size=patch_size, + stride=patch_size) + # CaiT norm is not included + #self.norm = nn.LayerNorm(embed_dim) + + def forward(self, x): + x = self.patch_embed(x) # [batch, embed_dim, h, w] h,w = patch_resolution + x = x.flatten(start_axis=2, stop_axis=-1) # [batch, embed_dim, h*w] h*w = num_patches + x = x.transpose([0, 2, 1]) # [batch, h*w, embed_dim] + #x = self.norm(x) # [batch, num_patches, embed_dim] # CaiT norm is not needed + return x + + +class Mlp(nn.Layer): + """ MLP module + + Impl using nn.Linear and activation is GELU, dropout is applied. 
+ Ops: fc -> act -> dropout -> fc -> dropout + + Attributes: + fc1: nn.Linear + fc2: nn.Linear + act: GELU + dropout1: dropout after fc1 + dropout2: dropout after fc2 + """ + + def __init__(self, in_features, hidden_features, dropout=0.): + super(Mlp, self).__init__() + w_attr_1, b_attr_1 = self._init_weights() + self.fc1 = nn.Linear(in_features, + hidden_features, + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + w_attr_2, b_attr_2 = self._init_weights() + self.fc2 = nn.Linear(hidden_features, + in_features, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + self.act = nn.GELU() + self.dropout = nn.Dropout(dropout) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierUniform()) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Normal(std=1e-6)) + return weight_attr, bias_attr + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.dropout(x) + x = self.fc2(x) + x = self.dropout(x) + return x + + +class ClassAttention(nn.Layer): + """ Class Attention + + Class Attention module + + Args: + dim: int, all heads dimension + dim_head: int, single heads dimension, default: None + num_heads: int, num of heads + qkv_bias: bool, if True, qkv linear layer is using bias, default: False + qk_scale: float, if None, qk_scale is dim_head ** -0.5, default: None + attention_dropout: float, dropout rate for attention dropout, default: 0. + dropout: float, dropout rate for projection dropout, default: 0. + """ + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attention_dropout=0., + dropout=0.): + super().__init__() + self.num_heads = num_heads + self.dim_head = dim // num_heads + self.scale = qk_scale or self.dim_head ** -0.5 + + self.q = nn.Linear(dim, dim, bias_attr=qkv_bias) + self.k = nn.Linear(dim, dim, bias_attr=qkv_bias) + self.v = nn.Linear(dim, dim, bias_attr=qkv_bias) + + self.attn_dropout = nn.Dropout(attention_dropout) + self.proj = nn.Linear(dim, dim) + self.proj_dropout = nn.Dropout(dropout) + self.softmax = nn.Softmax(axis=-1) + + def forward(self, x): + B, N, C = x.shape + + q = self.q(x[:, :1, :]) # same as x[:, 0], but more intuitive + q = q.reshape([B, self.num_heads, 1, self.dim_head]) + + k = self.k(x) + k = k.reshape([B, N, self.num_heads, self.dim_head]) + k = k.transpose([0, 2, 1, 3]) + + v = self.v(x) + v = v.reshape([B, N, self.num_heads, self.dim_head]) + v = v.transpose([0, 2, 1, 3]) + + attn = paddle.matmul(q * self.scale, k, transpose_y=True) + attn = self.softmax(attn) + attn = self.attn_dropout(attn) + + cls_embed = paddle.matmul(attn, v) + cls_embed = cls_embed.transpose([0, 2, 1, 3]) + cls_embed = cls_embed.reshape([B, 1, C]) + cls_embed = self.proj(cls_embed) + cls_embed = self.proj_dropout(cls_embed) + return cls_embed + + +class TalkingHeadAttention(nn.Layer): + """ Talking head attention + + Talking head attention (https://arxiv.org/abs/2003.02436), + applies linear projections across the attention-heads dimension, + before and after the softmax operation. + + Args: + dim: int, all heads dimension + num_heads: int, num of heads + qkv_bias: bool, if True, qkv linear layer is using bias, default: False + attention_dropout: float, dropout rate for attention dropout, default: 0. + dropout: float, dropout rate for projection dropout, default: 0. 
+ """ + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + dropout=0., + attention_dropout=0.): + super().__init__() + self.num_heads = num_heads + self.dim = dim + self.dim_head = dim // num_heads + self.scale = self.dim_head ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_dropout = nn.Dropout(attention_dropout) + self.softmax = nn.Softmax(axis=-1) + self.proj = nn.Linear(dim, dim) + self.proj_dropout = nn.Dropout(dropout) + + # talking head + self.proj_l = nn.Linear(num_heads, num_heads) + self.proj_w = nn.Linear(num_heads, num_heads) + + def transpose_multihead(self, x): + new_shape = x.shape[:-1] + [self.num_heads, self.dim_head] + x = x.reshape(new_shape) + x = x.transpose([0, 2, 1, 3]) + return x + + def forward(self, x): + B, H, C = x.shape # H: num_patches + qkv = self.qkv(x).chunk(3, axis=-1) + q, k, v = map(self.transpose_multihead, qkv) #[B, num_heads, num_patches, single_head_dim] + + q = q * self.scale + attn = paddle.matmul(q, k, transpose_y=True) #[B, num_heads, num_patches, num_patches] + + # projection across heads (before softmax) + attn = attn.transpose([0, 2, 3, 1]) #[B, num_patches, num_patches, num_heads] + attn = self.proj_l(attn) + attn = attn.transpose([0, 3, 1, 2]) #[B, num_heads, num_patches, num_patches] + + attn = self.softmax(attn) + + # projection across heads (after softmax) + attn = attn.transpose([0, 2, 3, 1]) #[B, num_patches, num_patches, num_heads] + attn = self.proj_w(attn) + attn = attn.transpose([0, 3, 1, 2]) #[B, num_heads, num_patches, num_patches] + + attn = self.attn_dropout(attn) + + z = paddle.matmul(attn, v) #[B, num_heads, num_patches, single_head_dim] + z = z.transpose([0, 2, 1, 3]) #[B, num_patches, num_heads, single_head_dim] + + z = z.reshape([B, H, C]) + z = self.proj(z) + z = self.proj_dropout(z) + + return z + + +class LayerScaleBlockClassAttention(nn.Layer): + """ LayerScale layers for class attention + + LayerScale layers for class attention contains regular class-attention layers, + in addition with gamma_1 and gamma_2, which apply per-channel multiplication + after each residual block (attention and mlp layers). + + Args: + dim: int, all heads dimension + num_heads: int, num of heads + mlp_ratio: ratio to multiply on mlp input dim as mlp hidden dim, default: 4. + qkv_bias: bool, if True, qkv linear layer is using bias, default: False + dropout: float, dropout rate for projection dropout, default: 0. + attention_dropout: float, dropout rate for attention dropout, default: 0. + init_values: initial values for learnable weights gamma_1 and gamma_2, default: 1e-4 + """ + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + dropout=0., + attention_dropout=0., + droppath=0., + init_values=1e-4): + super().__init__() + self.norm1 = nn.LayerNorm(dim, epsilon=1e-6) + self.attn = ClassAttention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + dropout=dropout, + attention_dropout=attention_dropout) + self.drop_path = DropPath(droppath) if droppath > 0. 
else Identity() + self.norm2 = nn.LayerNorm(dim, epsilon=1e-6) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim * mlp_ratio), + dropout=dropout) + + self.gamma_1 = paddle.create_parameter( + shape=[dim], + dtype='float32', + default_initializer=nn.initializer.Constant(init_values)) + self.gamma_2 = paddle.create_parameter( + shape=[dim], + dtype='float32', + default_initializer=nn.initializer.Constant(init_values)) + + def forward(self, x, x_cls): + u = paddle.concat([x_cls, x], axis=1) + + u = self.norm1(u) + u = self.attn(u) + u = self.gamma_1 * u + u = self.drop_path(u) + x_cls = u + x_cls + + h = x_cls + x_cls = self.norm2(x_cls) + x_cls = self.mlp(x_cls) + x_cls = self.gamma_2 * x_cls + x_cls = self.drop_path(x_cls) + x_cls = h + x_cls + + return x_cls + + +class LayerScaleBlock(nn.Layer): + """ LayerScale layers + + LayerScale layers contains regular self-attention layers, + in addition with gamma_1 and gamma_2, which apply per-channel multiplication + after each residual block (attention and mlp layers). + + Args: + dim: int, all heads dimension + num_heads: int, num of heads + mlp_ratio: ratio to multiply on mlp input dim as mlp hidden dim, default: 4. + qkv_bias: bool, if True, qkv linear layer is using bias, default: False + dropout: float, dropout rate for projection dropout, default: 0. + attention_dropout: float, dropout rate for attention dropout, default: 0. + init_values: initial values for learnable weights gamma_1 and gamma_2, default: 1e-4 + """ + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + dropout=0., + attention_dropout=0., + droppath=0., + init_values=1e-4): + super().__init__() + self.norm1 = nn.LayerNorm(dim, epsilon=1e-6) + self.attn = TalkingHeadAttention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + dropout=dropout, + attention_dropout=attention_dropout) + self.drop_path = DropPath(droppath) if droppath > 0. else Identity() + self.norm2 = nn.LayerNorm(dim, epsilon=1e-6) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim * mlp_ratio), + dropout=dropout) + + self.gamma_1 = paddle.create_parameter( + shape=[dim], + dtype='float32', + default_initializer=nn.initializer.Constant(init_values)) + self.gamma_2 = paddle.create_parameter( + shape=[dim], + dtype='float32', + default_initializer=nn.initializer.Constant(init_values)) + + def forward(self, x): + h = x + x = self.norm1(x) + x = self.attn(x) + x = self.gamma_1 * x #[B, num_patches, embed_dim] + x = self.drop_path(x) + x = h + x + + h = x + x = self.norm2(x) + x = self.mlp(x) + x = self.gamma_2 * x #[B, num_patches, embed_dim] + x = self.drop_path(x) + x = h + x + return x + + +class Cait(nn.Layer): + """ CaiT model + Args: + image_size: int, input image size, default: 224 + in_channels: int, input image channels, default: 3 + num_classes: int, num of classes, default: 1000 + patch_size: int, patch size for patch embedding, default: 16 + embed_dim: int, dim of each patch after patch embedding, default: 768 + depth: int, num of self-attention blocks, default: 12 + num_heads: int, num of attention heads, default: 12 + mlp_ratio: float, mlp hidden dim = mlp_ratio * mlp_in_dim, default: 4. + qkv_bias: bool, if True, qkv projection is set with bias, default: True + dropout: float, dropout rate for linear projections, default: 0. + attention_dropout: float, dropout rate for attention, default: 0. + droppath: float, drop path rate, default: 0. 
+ init_values: initial value for layer scales, default: 1e-4 + mlp_ratio_class_token: float, mlp_ratio for mlp used in class attention blocks, default: 4.0 + depth_token_only, int, num of class attention blocks, default: 2 + """ + def __init__(self, + image_size=224, + in_channels=3, + num_classes=1000, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=True, + dropout=0., + attention_dropout=0., + droppath=0, + init_values=1e-4, + mlp_ratio_class_token=4.0, + depth_token_only=2): + super().__init__() + self.num_classes = num_classes + # convert image to paches + self.patch_embed = PatchEmbedding(image_size=image_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + # tokens add for classification + self.cls_token = paddle.create_parameter( + shape=[1, 1, embed_dim], + dtype='float32', + default_initializer=nn.initializer.Constant(0.0)) + # positional embeddings for patch positions + self.pos_embed = paddle.create_parameter( + shape=[1, num_patches, embed_dim], + dtype='float32', + default_initializer=nn.initializer.Constant(0.0)) + + self.pos_dropout = nn.Dropout(dropout) + + # create self-attention(layer-scale) layers + layer_list = [] + for i in range(depth): + block_layers = LayerScaleBlock(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + dropout=dropout, + attention_dropout=attention_dropout, + droppath=droppath, + init_values=init_values) + layer_list.append(copy.deepcopy(block_layers)) + self.blocks = nn.LayerList(layer_list) + + # craete class-attention layers + layer_list = [] + for i in range(depth_token_only): + block_layers = LayerScaleBlockClassAttention( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio_class_token, + qkv_bias=qkv_bias, + dropout=0., + attention_dropout=0., + droppath=0., + init_values=init_values) + layer_list.append(copy.deepcopy(block_layers)) + self.blocks_token_only = nn.LayerList(layer_list) + + self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6) + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else Identity() + + def forward_features(self, x): + # Patch Embedding + x = self.patch_embed(x) # [B, num_patches, embed_dim] + cls_tokens = self.cls_token.expand([x.shape[0], -1, -1]) # [B, 1, embed_dim] + x = x + self.pos_embed + x = self.pos_dropout(x) + # Self-Attention blocks + for idx, block in enumerate(self.blocks): + x = block(x) # [B, num_patches, embed_dim] + # Class-Attention blocks + for idx, block in enumerate(self.blocks_token_only): + cls_tokens = block(x, cls_tokens) # [B, 1, embed_dim] + # Concat outputs + x = paddle.concat([cls_tokens, x], axis=1) + x = self.norm(x) # [B, num_patches + 1, embed_dim] + return x[:, 0] # returns only cls_tokens + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +def build_cait(config): + """build cait model using config""" + model = Cait(image_size=config.DATA.IMAGE_SIZE, + patch_size=config.MODEL.TRANS.PATCH_SIZE, + embed_dim=config.MODEL.TRANS.EMBED_DIM, + depth=config.MODEL.TRANS.DEPTH, + num_heads=config.MODEL.TRANS.NUM_HEADS, + mlp_ratio=config.MODEL.TRANS.MLP_RATIO, + qkv_bias=config.MODEL.TRANS.QKV_BIAS) + return model diff --git a/image_classification/CaiT/config.py b/image_classification/CaiT/config.py new file mode 100644 index 00000000..163a1fcd --- /dev/null +++ b/image_classification/CaiT/config.py @@ -0,0 +1,150 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration + +Configuration for data, model archtecture, and training, etc. +Config can be set by .yaml file or by argparser(limited usage) + + +""" +import os +from yacs.config import CfgNode as CN +import yaml + +_C = CN() +_C.BASE = [''] + +# data settings +_C.DATA = CN() +_C.DATA.BATCH_SIZE = 256 #256 # train batch_size for single GPU +_C.DATA.BATCH_SIZE_EVAL = 8 #64 # val batch_size for single GPU +_C.DATA.DATA_PATH = '/dataset/imagenet/' # path to dataset +_C.DATA.DATASET = 'imagenet2012' # dataset name +_C.DATA.IMAGE_SIZE = 224 # input image size: 224 for pretrain, 384 for finetune +_C.DATA.CROP_PCT = 0.875 # input image scale ratio, scale is applied before centercrop in eval mode +_C.DATA.NUM_WORKERS = 2 # number of data loading threads + +# model settings +_C.MODEL = CN() +_C.MODEL.TYPE = 'CaiT' +_C.MODEL.NAME = 'CaiT' +_C.MODEL.RESUME = None +_C.MODEL.PRETRAINED = None +_C.MODEL.NUM_CLASSES = 1000 +_C.MODEL.DROPOUT = 0.1 +_C.MODEL.ATTENTION_DROPOUT = 0.0 + +# transformer settings +_C.MODEL.TRANS = CN() +_C.MODEL.TRANS.PATCH_SIZE = 16 +_C.MODEL.TRANS.IN_CHANNELS = 3 +_C.MODEL.TRANS.EMBED_DIM = 192 +_C.MODEL.TRANS.DEPTH = 24 +_C.MODEL.TRANS.DEPTH_TOKEN_ONLY = 2 +_C.MODEL.TRANS.MLP_RATIO = 4.0 +_C.MODEL.TRANS.NUM_HEADS = 4 +_C.MODEL.TRANS.QKV_BIAS = True +_C.MODEL.TRANS.INIT_VALUES = 1e-5 + + +# training settings +_C.TRAIN = CN() +_C.TRAIN.LAST_EPOCH = 0 +_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 3 #34 # ~ 10k steps for 4096 batch size +_C.TRAIN.WEIGHT_DECAY = 0.05 #0.3 # 0.0 for finetune +_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune +_C.TRAIN.WARMUP_START_LR = 1e-6 #0.0 +_C.TRAIN.END_LR = 5e-4 +_C.TRAIN.GRAD_CLIP = 1.0 +_C.TRAIN.ACCUM_ITER = 2 #1 + +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' +_C.TRAIN.LR_SCHEDULER.MILESTONES = "30, 60, 90" # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 # only used in StepLRScheduler + +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'AdamW' +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for adamW +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# misc +_C.SAVE = "./output" +_C.TAG = "default" +_C.SAVE_FREQ = 5 # freq to save chpt +_C.REPORT_FREQ = 100 # freq to logging info +_C.VALIDATE_FREQ = 100 # freq to do validation +_C.SEED = 0 +_C.EVAL = False # run evaluation only +_C.LOCAL_RANK = 0 +_C.NGPUS = -1 + + +def _update_config_from_file(config, cfg_file): + config.defrost() + with open(cfg_file, 'r') as infile: + yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader) + for cfg in yaml_cfg.setdefault('BASE', ['']): + if cfg: + _update_config_from_file( + config, os.path.join(os.path.dirname(cfg_file), cfg) + ) + print('merging config from {}'.format(cfg_file)) + config.merge_from_file(cfg_file) + config.freeze() + +def update_config(config, args): + """Update config by ArgumentParser + Args: + args: ArgumentParser contains 
options + Return: + config: updated config + """ + if args.cfg: + _update_config_from_file(config, args.cfg) + config.defrost() + if args.dataset: + config.DATA.DATASET = args.dataset + if args.batch_size: + config.DATA.BATCH_SIZE = args.batch_size + if args.image_size: + config.DATA.IMAGE_SIZE = args.image_size + if args.data_path: + config.DATA.DATA_PATH = args.data_path + if args.ngpus: + config.NGPUS = args.ngpus + if args.eval: + config.EVAL = True + config.DATA.BATCH_SIZE_EVAL = args.batch_size + if args.pretrained: + config.MODEL.PRETRAINED = args.pretrained + if args.resume: + config.MODEL.RESUME = args.resume + if args.last_epoch: + config.TRAIN.LAST_EPOCH = args.last_epoch + + #config.freeze() + return config + + +def get_config(cfg_file=None): + """Return a clone of config or load from yaml file""" + config = _C.clone() + if cfg_file: + _update_config_from_file(config, cfg_file) + return config diff --git a/image_classification/CaiT/configs/cait_m48_448.yaml b/image_classification/CaiT/configs/cait_m48_448.yaml new file mode 100644 index 00000000..904afd93 --- /dev/null +++ b/image_classification/CaiT/configs/cait_m48_448.yaml @@ -0,0 +1,16 @@ +DATA: + IMAGE_SIZE: 448 + CROP_PCT: 1.0 + +MODEL: + TYPE: cait + NAME: cait_m48_448 + TRANS: + PATCH_SIZE: 16 + EMBED_DIM: 768 + DEPTH: 48 + NUM_HEADS: 16 + MLP_RATIO: 4.0 + QKV_BIAS: True + INIT_VALUES: 1e-6 + DEPTH_TOKEN_ONLY: 2 diff --git a/image_classification/CaiT/configs/cait_s24_384.yaml b/image_classification/CaiT/configs/cait_s24_384.yaml new file mode 100644 index 00000000..9e042574 --- /dev/null +++ b/image_classification/CaiT/configs/cait_s24_384.yaml @@ -0,0 +1,14 @@ +DATA: + IMAGE_SIZE: 384 + CROP_PCT: 1.0 + +MODEL: + TYPE: cait + NAME: cait_s24_284 + TRANS: + PATCH_SIZE: 16 + EMBED_DIM: 384 + DEPTH: 24 + NUM_HEADS: 8 + INIT_VALUES: 1e-5 + DEPTH_TOKEN_ONLY: 2 diff --git a/image_classification/CaiT/configs/cait_xxs24_224.yaml b/image_classification/CaiT/configs/cait_xxs24_224.yaml new file mode 100644 index 00000000..0908c7a9 --- /dev/null +++ b/image_classification/CaiT/configs/cait_xxs24_224.yaml @@ -0,0 +1,15 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 1.0 +MODEL: + TYPE: cait + NAME: cait_xxs24_224 + TRANS: + PATCH_SIZE: 16 + EMBED_DIM: 192 + DEPTH: 24 + NUM_HEADS: 4 + MLP_RATIO: 4.0 + QKV_BIAS: True + INIT_VALUES: 1e-5 + DEPTH_TOKEN_ONLY: 2 diff --git a/image_classification/CaiT/datasets.py b/image_classification/CaiT/datasets.py new file mode 100644 index 00000000..66afc611 --- /dev/null +++ b/image_classification/CaiT/datasets.py @@ -0,0 +1,188 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Dataset related classes and methods for ViT training and validation +Cifar10, Cifar100 and ImageNet2012 are supported +""" + +import os +import math +from paddle.io import Dataset, DataLoader, DistributedBatchSampler +from paddle.vision import transforms, datasets, image_load + +class ImageNet2012Dataset(Dataset): + """Build ImageNet2012 dataset + + This class gets train/val imagenet datasets, which loads transfomed data and labels. + + Attributes: + file_folder: path where imagenet images are stored + transform: preprocessing ops to apply on image + img_path_list: list of full path of images in whole dataset + label_list: list of labels of whole dataset + """ + + def __init__(self, file_folder, mode="train", transform=None): + """Init ImageNet2012 Dataset with dataset file path, mode(train/val), and transform""" + super(ImageNet2012Dataset, self).__init__() + assert mode in ["train", "val"] + self.file_folder = file_folder + self.transform = transform + self.img_path_list = [] + self.label_list = [] + + if mode == "train": + self.list_file = os.path.join(self.file_folder, "train_list.txt") + else: + self.list_file = os.path.join(self.file_folder, "val_list.txt") + + with open(self.list_file, 'r') as infile: + for line in infile: + img_path = line.strip().split()[0] + img_label = int(line.strip().split()[1]) + self.img_path_list.append(os.path.join(self.file_folder, img_path)) + self.label_list.append(img_label) + print(f'----- Imagenet2012 image {mode} list len = {len(self.label_list)}') + + def __len__(self): + return len(self.label_list) + + def __getitem__(self, index): + data = image_load(self.img_path_list[index]).convert('RGB') + data = self.transform(data) + label = self.label_list[index] + + return data, label + + +def get_train_transforms(config): + """ Get training transforms + + For training, a RandomResizedCrop is applied, then normalization is applied with + [0.5, 0.5, 0.5] mean and std. The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + transforms_train = transforms.Compose([ + transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), + scale=(0.05, 1.0)), + transforms.ToTensor(), + #transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_train + + +def get_val_transforms(config): + """ Get training transforms + + For validation, image is first Resize then CenterCrop to image_size. + Then normalization is applied with [0.5, 0.5, 0.5] mean and std. + The input pixel values must be rescaled to [0, 1.] 
+ Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) + transforms_val = transforms.Compose([ + transforms.Resize(scale_size, 'bicubic'), + transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), + transforms.ToTensor(), + #transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_val + + +def get_dataset(config, mode='train'): + """ Get dataset from config and mode (train/val) + + Returns the related dataset object according to configs and mode(train/val) + + Args: + config: configs contains dataset related settings. see config.py for details + Returns: + dataset: dataset object + """ + + assert mode in ['train', 'val'] + if config.DATA.DATASET == "cifar10": + if mode == 'train': + dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "cifar100": + if mode == 'train': + dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "imagenet2012": + if mode == 'train': + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_train_transforms(config)) + else: + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_val_transforms(config)) + else: + raise NotImplementedError( + "[{config.DATA.DATASET}] Only cifar10, cifar100, imagenet2012 are supported now") + return dataset + + +def get_dataloader(config, dataset, mode='train', multi_process=False): + """Get dataloader with config, dataset, mode as input, allows multiGPU settings. + + Multi-GPU loader is implements as distributedBatchSampler. + + Args: + config: see config.py for details + dataset: paddle.io.dataset object + mode: train/val + multi_process: if True, use DistributedBatchSampler to support multi-processing + Returns: + dataloader: paddle.io.DataLoader object. + """ + + if mode == 'train': + batch_size = config.DATA.BATCH_SIZE + else: + batch_size = config.DATA.BATCH_SIZE_EVAL + + if multi_process is True: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=(mode == 'train')) + dataloader = DataLoader(dataset, + batch_sampler=sampler, + num_workers=config.DATA.NUM_WORKERS) + else: + dataloader = DataLoader(dataset, + batch_size=batch_size, + num_workers=config.DATA.NUM_WORKERS, + shuffle=(mode == 'train')) + return dataloader diff --git a/image_classification/CaiT/droppath.py b/image_classification/CaiT/droppath.py new file mode 100644 index 00000000..25b8d5ff --- /dev/null +++ b/image_classification/CaiT/droppath.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth +""" + +import paddle +import paddle.nn as nn + + +class DropPath(nn.Layer): + """DropPath class""" + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def drop_path(self, inputs): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if self.drop_prob == 0. or not self.training: + return inputs + keep_prob = 1 - self.drop_prob + keep_prob = paddle.to_tensor(keep_prob, dtype='float32') + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide(keep_prob) * random_tensor #divide is to keep same output expectation + return output + + def forward(self, inputs): + return self.drop_path(inputs) + + +#def main(): +# tmp = paddle.to_tensor(np.random.rand(8, 16, 8, 8), dtype='float32') +# dp = DropPath(0.5) +# out = dp(tmp) +# print(out) +# +#if __name__ == "__main__": +# main() diff --git a/image_classification/CaiT/main_multi_gpu.py b/image_classification/CaiT/main_multi_gpu.py new file mode 100644 index 00000000..d14970f7 --- /dev/null +++ b/image_classification/CaiT/main_multi_gpu.py @@ -0,0 +1,362 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""CaiT training/validation using multiple GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as dist +from datasets import get_dataloader, get_dataset +from cait import build_cait as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('CaiT') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +arguments = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, arguments) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + # + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + + 
train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + dist.all_reduce(loss) + dist.all_reduce(acc1) + dist.all_reduce(acc5) + loss = loss / dist.get_world_size() + acc1 = acc1 / dist.get_world_size() + acc5 = acc5 / dist.get_world_size() + + batch_size = paddle.to_tensor(image.shape[0]) + dist.all_reduce(batch_size) + + val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) + val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) + val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0]) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main_worker(*args): + # 0. Preparation + dist.init_parallel_env() + last_epoch = config.TRAIN.LAST_EPOCH + world_size = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + seed = config.SEED + local_rank + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # 1. Create model + model = build_model(config) + model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train, dataset_val = args[0], args[1] + dataloader_train = get_dataloader(config, dataset_train, 'train', True) + dataloader_val = get_dataloader(config, dataset_val, 'test', True) + total_batch_train = len(dataloader_train) + total_batch_val = len(dataloader_val) + logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. 
Define optimizer and lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip, + #apply_decay_param_fun=get_exclude_from_weight_decay_fn(['pos_embed', 'cls_token']), + ) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # 5. Load pretrained model / load resumt model and optimizer states + if config.MODEL.PRETRAINED: + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + + # 6. 
Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + + # 6. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER) + scheduler.step() + + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if local_rank == 0: + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +def main(): + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS + dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + + +if __name__ == "__main__": + main() diff --git a/image_classification/CaiT/main_single_gpu.py b/image_classification/CaiT/main_single_gpu.py new file mode 100644 index 00000000..5432c23b --- /dev/null +++ b/image_classification/CaiT/main_single_gpu.py @@ -0,0 +1,332 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""CaiT training/validation using single GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from datasets import get_dataloader +from datasets import get_dataset +from cait import build_cait as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('CaiT') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +args = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, args) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +#config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + + train_time = 
time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + batch_size = image.shape[0] + val_loss_meter.update(loss.numpy()[0], batch_size) + val_acc1_meter.update(acc1.numpy()[0], batch_size) + val_acc5_meter.update(acc5.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main(): + # 0. Preparation + last_epoch = config.TRAIN.LAST_EPOCH + seed = config.SEED + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + #paddle.set_device('gpu:0') + # 1. Create model + model = build_model(config) + # 2. Create train and val dataloader + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + dataloader_train = get_dataloader(config, dataset_train, 'train', False) + dataloader_val = get_dataloader(config, dataset_val, 'val', False) + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. Define lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + # 5. 
Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + # 6. Load pretrained model or load resume model and optimizer states + if config.MODEL.PRETRAINED: + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") + # 7. Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + # 8. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. 
LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=len(dataloader_train), + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + ) + scheduler.step() + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +if __name__ == "__main__": + main() diff --git a/image_classification/CaiT/port_weights/load_pytorch_weights_m48_448.py b/image_classification/CaiT/port_weights/load_pytorch_weights_m48_448.py new file mode 100644 index 00000000..2a92152e --- /dev/null +++ b/image_classification/CaiT/port_weights/load_pytorch_weights_m48_448.py @@ -0,0 +1,193 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +import timm +from cait import * +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/cait_m48_448.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + +def torch_to_paddle_mapping(): + mapping = [ + ('cls_token', 'cls_token'), + ('pos_embed', 'pos_embed'), + ('patch_embed.proj', f'patch_embed.patch_embed'), + ] + + num_layers = config.MODEL.TRANS.DEPTH + for idx in range(num_layers): + pp_prefix = f'blocks.{idx}' + th_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.gamma_1', f'{pp_prefix}.gamma_1'), + (f'{th_prefix}.gamma_2', f'{pp_prefix}.gamma_2'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.attn.proj_l', f'{pp_prefix}.attn.proj_l'), + (f'{th_prefix}.attn.proj_w', f'{pp_prefix}.attn.proj_w'), + ] + mapping.extend(layer_mapping) + + num_layers = config.MODEL.TRANS.DEPTH_TOKEN_ONLY + for idx in range(num_layers): + pp_prefix = f'blocks_token_only.{idx}' + th_prefix = f'blocks_token_only.{idx}' + layer_mapping = [ + (f'{th_prefix}.gamma_1', f'{pp_prefix}.gamma_1'), + (f'{th_prefix}.gamma_2', f'{pp_prefix}.gamma_2'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + (f'{th_prefix}.attn.q', f'{pp_prefix}.attn.q'), + (f'{th_prefix}.attn.k', f'{pp_prefix}.attn.k'), + (f'{th_prefix}.attn.v', f'{pp_prefix}.attn.v'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head') + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. 
get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_cait(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model = timm.create_model('cait_m48_448', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + #return + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 448, 448).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-4) + + # save weights for paddle model + model_path = os.path.join('./cait_m48_448.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/CaiT/port_weights/load_pytorch_weights_s24_384.py b/image_classification/CaiT/port_weights/load_pytorch_weights_s24_384.py new file mode 100644 index 00000000..45e811cb --- /dev/null +++ b/image_classification/CaiT/port_weights/load_pytorch_weights_s24_384.py @@ -0,0 +1,194 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +import timm +from cait import * +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/cait_s24_384.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + +def torch_to_paddle_mapping(): + mapping = [ + ('cls_token', 'cls_token'), + ('pos_embed', 'pos_embed'), + ('patch_embed.proj', f'patch_embed.patch_embed'), + ] + + num_layers = config.MODEL.TRANS.DEPTH + for idx in range(num_layers): + pp_prefix = f'blocks.{idx}' + th_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.gamma_1', f'{pp_prefix}.gamma_1'), + (f'{th_prefix}.gamma_2', f'{pp_prefix}.gamma_2'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.attn.proj_l', f'{pp_prefix}.attn.proj_l'), + (f'{th_prefix}.attn.proj_w', f'{pp_prefix}.attn.proj_w'), + ] + mapping.extend(layer_mapping) + + num_layers = config.MODEL.TRANS.DEPTH_TOKEN_ONLY + for idx in range(num_layers): + pp_prefix = f'blocks_token_only.{idx}' + th_prefix = f'blocks_token_only.{idx}' + layer_mapping = [ + (f'{th_prefix}.gamma_1', f'{pp_prefix}.gamma_1'), + (f'{th_prefix}.gamma_2', f'{pp_prefix}.gamma_2'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + (f'{th_prefix}.attn.q', f'{pp_prefix}.attn.q'), + (f'{th_prefix}.attn.k', f'{pp_prefix}.attn.k'), + (f'{th_prefix}.attn.v', f'{pp_prefix}.attn.v'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head') + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. 
get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_cait(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model = timm.create_model('cait_s24_384', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + + #return + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 384, 384).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./cait_s24_384.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/CaiT/port_weights/load_pytorch_weights_xxs24_224.py b/image_classification/CaiT/port_weights/load_pytorch_weights_xxs24_224.py new file mode 100644 index 00000000..3fcf635c --- /dev/null +++ b/image_classification/CaiT/port_weights/load_pytorch_weights_xxs24_224.py @@ -0,0 +1,192 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +import timm +from cait import * +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/cait_xxs24_224.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + +def torch_to_paddle_mapping(): + mapping = [ + ('cls_token', 'cls_token'), + ('pos_embed', 'pos_embed'), + ('patch_embed.proj', f'patch_embed.patch_embed'), + ] + + num_layers = config.MODEL.TRANS.DEPTH + for idx in range(num_layers): + pp_prefix = f'blocks.{idx}' + th_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.gamma_1', f'{pp_prefix}.gamma_1'), + (f'{th_prefix}.gamma_2', f'{pp_prefix}.gamma_2'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.attn.proj_l', f'{pp_prefix}.attn.proj_l'), + (f'{th_prefix}.attn.proj_w', f'{pp_prefix}.attn.proj_w'), + ] + mapping.extend(layer_mapping) + + num_layers = config.MODEL.TRANS.DEPTH_TOKEN_ONLY + for idx in range(num_layers): + pp_prefix = f'blocks_token_only.{idx}' + th_prefix = f'blocks_token_only.{idx}' + layer_mapping = [ + (f'{th_prefix}.gamma_1', f'{pp_prefix}.gamma_1'), + (f'{th_prefix}.gamma_2', f'{pp_prefix}.gamma_2'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + (f'{th_prefix}.attn.q', f'{pp_prefix}.attn.q'), + (f'{th_prefix}.attn.k', f'{pp_prefix}.attn.k'), + (f'{th_prefix}.attn.v', f'{pp_prefix}.attn.v'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head') + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. 
get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_cait(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model = timm.create_model('cait_xxs24_224', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./cait_xxs24_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/CaiT/run_eval.sh b/image_classification/CaiT/run_eval.sh new file mode 100644 index 00000000..b3568ca8 --- /dev/null +++ b/image_classification/CaiT/run_eval.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ +-cfg='./configs/cait_xxs24_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=4 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./cait_xxs24_224' \ No newline at end of file diff --git a/image_classification/CaiT/run_eval_multi.sh b/image_classification/CaiT/run_eval_multi.sh new file mode 100644 index 00000000..e0732977 --- /dev/null +++ b/image_classification/CaiT/run_eval_multi.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/cait_xxs24_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./cait_xxs24_224' \ +-ngpus=4 diff --git a/image_classification/CaiT/run_eval_multi_m48_448.sh b/image_classification/CaiT/run_eval_multi_m48_448.sh new file mode 100644 index 00000000..f98b01db --- /dev/null +++ b/image_classification/CaiT/run_eval_multi_m48_448.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/cait_m48_448.yaml' \ +-dataset='imagenet2012' \ +-batch_size=1 \ 
+-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./cait_m48_448' \ +-ngpus=4 diff --git a/image_classification/CaiT/run_eval_multi_s24_384.sh b/image_classification/CaiT/run_eval_multi_s24_384.sh new file mode 100644 index 00000000..6517ed15 --- /dev/null +++ b/image_classification/CaiT/run_eval_multi_s24_384.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/cait_s24_384.yaml' \ +-dataset='imagenet2012' \ +-batch_size=16 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./cait_s24_384' \ +-ngpus=4 diff --git a/image_classification/CaiT/run_eval_multi_xxs24_224.sh b/image_classification/CaiT/run_eval_multi_xxs24_224.sh new file mode 100644 index 00000000..e0732977 --- /dev/null +++ b/image_classification/CaiT/run_eval_multi_xxs24_224.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/cait_xxs24_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./cait_xxs24_224' \ +-ngpus=4 diff --git a/image_classification/CaiT/run_train.sh b/image_classification/CaiT/run_train.sh new file mode 100644 index 00000000..369ada22 --- /dev/null +++ b/image_classification/CaiT/run_train.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ +-cfg='./configs/cait_xxs24_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=4 \ +-data_path='/dataset/imagenet' diff --git a/image_classification/CaiT/run_train_multi.sh b/image_classification/CaiT/run_train_multi.sh new file mode 100644 index 00000000..33d4b09d --- /dev/null +++ b/image_classification/CaiT/run_train_multi.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/cait_xxs24_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ diff --git a/image_classification/CaiT/tests/__init__.py b/image_classification/CaiT/tests/__init__.py new file mode 100644 index 00000000..e2cbd538 --- /dev/null +++ b/image_classification/CaiT/tests/__init__.py @@ -0,0 +1 @@ +#init diff --git a/image_classification/CaiT/tests/test_cait.py b/image_classification/CaiT/tests/test_cait.py new file mode 100644 index 00000000..85c24183 --- /dev/null +++ b/image_classification/CaiT/tests/test_cait.py @@ -0,0 +1,75 @@ +import unittest +import numpy as np +import paddle +import paddle.nn as nn +from config import * +from cait import * + + +class CaitTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + paddle.set_device('cpu') + cls.config = get_config() + cls.dummy_img = np.random.randn(4, 3, 224, 224).astype('float32') + cls.dummy_tensor = paddle.to_tensor(cls.dummy_img) + + @classmethod + def tearDown(cls): + pass + + #@unittest.skip('skip for debug') + def test_identity(self): + layer = Identity() + out = layer(CaitTest.dummy_tensor) + self.assertTrue(np.allclose(out.numpy(), CaitTest.dummy_tensor.numpy())) + + def test_patchembedding(self): + layer = PatchEmbedding() + tensor = paddle.randn(shape=[4, 3, 224, 224]) + out = layer(tensor) + self.assertEqual([4, 3136, 96], out.shape) + + def test_mlp(self): + layer = Mlp(in_features=128, hidden_features=64, dropout=0.1) + tensor = paddle.randn(shape=[4, 128]) + out = layer(tensor) + self.assertEqual([4, 128], out.shape) + + def test_talkinghead_attention(self): + layer = TalkingHeadAttention(dim=64, num_heads=8) + tensor = paddle.randn(shape=[4, 196, 64]) + out = layer(tensor) + self.assertEqual([4, 196, 64], out.shape) + + def test_layer_scale_block(self): + layer = 
LayerScaleBlock(dim=64, num_heads=8) + tensor = paddle.randn(shape=[4, 196, 64]) + out = layer(tensor) + self.assertEqual([4, 196, 64], out.shape) + + def test_class_attention(self): + layer = ClassAttention(dim=64) + tensor = paddle.randn(shape=[4, 196, 64]) + out = layer(tensor) + self.assertEqual([4, 1, 64], out.shape) + + def test_layer_scale_block_class_attention(self): + layer = LayerScaleBlockClassAttention(dim=64, num_heads=8) + tensor = paddle.randn(shape=[4, 196, 64]) + cls_tensor = paddle.randn(shape=[4, 1, 64]) + out = layer(tensor, cls_tensor) + self.assertEqual([4, 1, 64], out.shape) + + #@unittest.skip('skip for debug') + def test_build_model(self): + print(CaitTest.config) + model = build_cait(CaitTest.config) + print(model) + + #@unittest.skip('skip for debug') + def test_model_inference(self): + print(CaitTest.config) + model = build_cait(CaitTest.config) + print(model(CaitTest.dummy_tensor)) + diff --git a/image_classification/CaiT/utils.py b/image_classification/CaiT/utils.py new file mode 100644 index 00000000..44800527 --- /dev/null +++ b/image_classification/CaiT/utils.py @@ -0,0 +1,120 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""utils for ViT + +Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training +and WarmupCosineScheduler for training + +""" + +import math +from paddle.optimizer.lr import LRScheduler + + +class AverageMeter(): + """ Meter for monitoring losses""" + def __init__(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.reset() + + def reset(self): + """reset all values to zeros""" + self.avg = 0 + self.sum = 0 + self.cnt = 0 + + def update(self, val, n=1): + """update avg by val and n, where val is the avg of n values""" + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + + +def get_exclude_from_weight_decay_fn(exclude_list=[]): + """ Set params with no weight decay during the training + + For certain params, e.g., positional encoding in ViT, weight decay + may not needed during the learning, this method is used to find + these params. + + Args: + exclude_list: a list of params names which need to exclude + from weight decay. + Returns: + exclude_from_weight_decay_fn: a function returns True if param + will be excluded from weight decay + """ + if len(exclude_list) == 0: + exclude_from_weight_decay_fn = None + else: + def exclude_fn(param): + for name in exclude_list: + if param.endswith(name): + return False + return True + exclude_from_weight_decay_fn = exclude_fn + return exclude_from_weight_decay_fn + + +class WarmupCosineScheduler(LRScheduler): + """Warmup Cosine Scheduler + + First apply linear warmup, then apply cosine decay schedule. + Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" + Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining + "total_epochs - warmup_epochs" + + Attributes: + learning_rate: the starting learning rate (without warmup), not used here! 
+ warmup_start_lr: warmup starting learning rate + start_lr: the starting learning rate (without warmup) + end_lr: the ending learning rate after whole loop + warmup_epochs: # of epochs for warmup + total_epochs: # of total epochs (include warmup) + """ + def __init__(self, + learning_rate, + warmup_start_lr, + start_lr, + end_lr, + warmup_epochs, + total_epochs, + cycles=0.5, + last_epoch=-1, + verbose=False): + """init WarmupCosineScheduler """ + self.warmup_epochs = warmup_epochs + self.total_epochs = total_epochs + self.warmup_start_lr = warmup_start_lr + self.start_lr = start_lr + self.end_lr = end_lr + self.cycles = cycles + super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + """ return lr value """ + if self.last_epoch < self.warmup_epochs: + val = (self.start_lr - self.warmup_start_lr) * float( + self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr + return val + + progress = float(self.last_epoch - self.warmup_epochs) / float( + max(1, self.total_epochs - self.warmup_epochs)) + val = max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) + return val diff --git a/image_classification/DeiT/README.md b/image_classification/DeiT/README.md new file mode 100644 index 00000000..208667aa --- /dev/null +++ b/image_classification/DeiT/README.md @@ -0,0 +1,173 @@ +# Training data-efficient image transformers & distillation through attention, [arxiv](https://arxiv.org/abs/2012.12877) + +PaddlePaddle training/validation code and pretrained models for **DeiT**. + +The official pytorch implementation is [here](https://github.com/facebookresearch/deit). + +This implementation is developed by [PaddleViT](https://github.com/BR-IDL/PaddleViT.git). + + +

+<img src="./deit.png" alt="drawing"/>
+<h4 align="center">DeiT Model Overview</h4>

+ +### Update +Update (2021-08-11): Code is released and ported weights are uploaded. + +## Models Zoo +| Model | Acc@1 | Acc@5 | Image Size | Crop_pct | Interpolation | Link | +|--------------------------------|-------|-------|------------|----------|---------------|--------------| +| deit_base_distilled_patch16_224| 83.32 | 96.49 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/12_x6-NN3Jde2BFUih4OM9NlTwe9-Xlkw/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1ZnmAWgT6ewe7Vl3Xw_csuA)(5f2g) | +| deit_base_distilled_patch16_384| 85.43 | 97.33 | 384 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1i5H_zjSdHfM-Znv89DHTv9ChykWrIt8I/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1PQsQIci4VCHY7l2tCzMklg)(qgj2) | + +| Teacher Model | Link | +| -- | -- | +| RegNet_Y_160 | [google](https://drive.google.com/file/d/1_nEYFnQqlGGqboLq_VmdRvV9mLGSrbyG/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1NZNhiO4xDfqHiRiIbk9BCA)(gjsm) | + +> *The results are evaluated on ImageNet2012 validation set. + + +## Notebooks +We provide a few notebooks in aistudio to help you get started: + +**\*(coming soon)\*** + + +## Requirements +- Python>=3.6 +- yaml>=0.2.5 +- [PaddlePaddle](https://www.paddlepaddle.org.cn/documentation/docs/en/install/index_en.html)>=2.1.0 +- [yacs](https://github.com/rbgirshick/yacs)>=0.1.8 + +## Data +ImageNet2012 dataset is used in the following folder structure: +``` +│imagenet/ +├──train/ +│ ├── n01440764 +│ │ ├── n01440764_10026.JPEG +│ │ ├── n01440764_10027.JPEG +│ │ ├── ...... +│ ├── ...... +├──val/ +│ ├── n01440764 +│ │ ├── ILSVRC2012_val_00000293.JPEG +│ │ ├── ILSVRC2012_val_00002138.JPEG +│ │ ├── ...... +│ ├── ...... +``` + +## Usage +To use the model with pretrained weights, download the `.pdparam` weight file and change related file paths in the following python scripts. The model config files are located in `./configs/`. + +For example, assume the downloaded weight file is stored in `./deit_base_patch16_224.pdparams`, to use the `deit_base_patch16_224` model in python: +```python +from config import get_config +from deit import build_deit as build_model +# config files in ./configs/ +config = get_config('./configs/deit_base_patch16_224.yaml') +# build model +model = build_model(config) +# load pretrained weights, .pdparams is NOT needed +model_state_dict = paddle.load('./deit_base_patch16_224') +model.set_dict(model_state_dict) +``` + +## Evaluation +To evaluate DeiT model performance on ImageNet2012 with a single GPU, run the following script using command line: +```shell +sh run_eval.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/deit_base_patch16_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./deit_base_patch16_224' +``` + +
+ + +Run evaluation using multi-GPUs: + + + +```shell +sh run_eval_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/deit_base_patch16_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./deit_base_patch16_224' +``` + +
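A note on the `Crop_pct` column in the models zoo table: evaluation preprocessing in `datasets.py` first resizes images to `image_size / crop_pct` (bicubic) and then center-crops them to `image_size`. Below is a minimal worked example with the `deit_base_patch16_224` settings; it is for illustration only and is not part of the run scripts.

```python
import math

image_size, crop_pct = 224, 0.875                    # values from configs/deit_base_patch16_224.yaml
scale_size = int(math.floor(image_size / crop_pct))  # same formula as get_val_transforms in datasets.py
print(scale_size)  # 256 -> shorter side resized to 256 (bicubic), then center-cropped to 224
```

For `deit_base_distilled_patch16_384`, `CROP_PCT` is 1.0, so images are resized directly to 384 before the center crop.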
+ + + +## Training +To train the DeiT Transformer model on ImageNet2012 with single GPU, **download** the pretrained weights of **teacher** model (`regnety_160.pdparams`) and run the following script using command line: + +```shell +sh run_train_single.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/deit_base_patch16_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=32 \ + -data_path='/dataset/imagenet' \ + -teacher_model='./regnety_160' +``` + +
+ + +Run training using multi-GPUs: + + + +```shell +sh run_train_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/deit_base_patch16_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -teacher_model='./regnety_160' +``` + +
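The teacher model is used for token distillation; the DeiT config added later in this diff defaults to hard distillation (`DISTILLATION_TYPE: 'hard'`, `DISTILLATION_ALPHA: 0.5`). The sketch below only illustrates what a hard-distillation objective computes; the function name and signature are hypothetical and do not correspond to the repository's actual loss implementation.

```python
import paddle
import paddle.nn.functional as F

def hard_distillation_loss(class_logits, distill_logits, teacher_logits, labels, alpha=0.5):
    """Average the ground-truth CE on the class token with a CE against the
    teacher's hard (argmax) predictions on the distillation token."""
    base_loss = F.cross_entropy(class_logits, labels)
    teacher_labels = paddle.argmax(teacher_logits, axis=-1)  # teacher is run under no_grad elsewhere
    distill_loss = F.cross_entropy(distill_logits, teacher_labels)
    return (1.0 - alpha) * base_loss + alpha * distill_loss
```

With `alpha=0.5` the two terms are weighted equally, matching the default `DISTILLATION_ALPHA` in `config.py`.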
+ +## Visualization Attention Map +**(coming soon)** + +## Reference +``` +@inproceedings{touvron2021training, + title={Training data-efficient image transformers \& distillation through attention}, + author={Touvron, Hugo and Cord, Matthieu and Douze, Matthijs and Massa, Francisco and Sablayrolles, Alexandre and J{\'e}gou, Herv{\'e}}, + booktitle={International Conference on Machine Learning}, + pages={10347--10357}, + year={2021}, + organization={PMLR} +} +``` diff --git a/image_classification/DeiT/auto_augment.py b/image_classification/DeiT/auto_augment.py new file mode 100644 index 00000000..a8daf02b --- /dev/null +++ b/image_classification/DeiT/auto_augment.py @@ -0,0 +1,223 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Auto Augmentation""" + +import random +import numpy as np +from PIL import Image, ImageEnhance, ImageOps + + +def auto_augment_policy_original(): + """ImageNet auto augment policy""" + policy = [ + [('Posterize', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('Posterize', 0.6, 7), ('Posterize', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('Posterize', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('Posterize', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], + [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + policy = [[SubPolicy(*args) for args in subpolicy] for subpolicy in policy] + return policy + + +class AutoAugment(): + """Auto Augment + Randomly choose a tuple of augment ops from a list of policy + Then apply the tuple of augment ops to input image + """ + def __init__(self, policy): + self.policy = policy + + def __call__(self, image, policy_idx=None): + if policy_idx is None: + policy_idx = random.randint(0, len(self.policy)-1) + + sub_policy = self.policy[policy_idx] + for op in sub_policy: + image = op(image) + return image + + +class SubPolicy: + """Subpolicy + Read augment name and magnitude, apply augment with probability + Args: + op_name: str, augment operation name + prob: float, if prob > random prob, apply augment + magnitude_idx: int, index of magnitude in preset magnitude ranges + """ + def __init__(self, op_name, prob, magnitude_idx): + # 
ranges of operations' magnitude + ranges = { + 'ShearX': np.linspace(0, 0.3, 10), # [-0.3, 0.3] (by random negative) + 'ShearY': np.linspace(0, 0.3, 10), # [-0.3, 0.3] (by random negative) + 'TranslateX': np.linspace(0, 150 / 331, 10), #[-0.45, 0.45] (by random negative) + 'TranslateY': np.linspace(0, 150 / 331, 10), #[-0.45, 0.45] (by random negative) + 'Rotate': np.linspace(0, 30, 10), #[-30, 30] (by random negative) + 'Color': np.linspace(0, 0.9, 10), #[-0.9, 0.9] (by random negative) + 'Posterize': np.round(np.linspace(8, 4, 10), 0).astype(np.int), #[0, 4] + 'Solarize': np.linspace(256, 0, 10), #[0, 256] + 'Contrast': np.linspace(0, 0.9, 10), #[-0.9, 0.9] (by random negative) + 'Sharpness': np.linspace(0, 0.9, 10), #[-0.9, 0.9] (by random negative) + 'Brightness': np.linspace(0, 0.9, 10), #[-0.9, 0.9] (by random negative) + 'AutoContrast': [0] * 10, # no range + 'Equalize': [0] * 10, # no range + 'Invert': [0] * 10, # no range + } + + # augmentation operations + # Lambda is not pickleable for DDP + #image_ops = { + # 'ShearX': lambda image, magnitude: shear_x(image, magnitude), + # 'ShearY': lambda image, magnitude: shear_y(image, magnitude), + # 'TranslateX': lambda image, magnitude: translate_x(image, magnitude), + # 'TranslateY': lambda image, magnitude: translate_y(image, magnitude), + # 'Rotate': lambda image, magnitude: rotate(image, magnitude), + # 'AutoContrast': lambda image, magnitude: auto_contrast(image, magnitude), + # 'Invert': lambda image, magnitude: invert(image, magnitude), + # 'Equalize': lambda image, magnitude: equalize(image, magnitude), + # 'Solarize': lambda image, magnitude: solarize(image, magnitude), + # 'Posterize': lambda image, magnitude: posterize(image, magnitude), + # 'Contrast': lambda image, magnitude: contrast(image, magnitude), + # 'Color': lambda image, magnitude: color(image, magnitude), + # 'Brightness': lambda image, magnitude: brightness(image, magnitude), + # 'Sharpness': lambda image, magnitude: sharpness(image, magnitude), + #} + image_ops = { + 'ShearX': shear_x, + 'ShearY': shear_y, + 'TranslateX': translate_x_relative, + 'TranslateY': translate_y_relative, + 'Rotate': rotate, + 'AutoContrast': auto_contrast, + 'Invert': invert, + 'Equalize': equalize, + 'Solarize': solarize, + 'Posterize': posterize, + 'Contrast': contrast, + 'Color': color, + 'Brightness': brightness, + 'Sharpness': sharpness, + } + + self.prob = prob + self.magnitude = ranges[op_name][magnitude_idx] + self.op = image_ops[op_name] + + def __call__(self, image): + if self.prob > random.random(): + image = self.op(image, self.magnitude) + return image + + +# PIL Image transforms +# https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.transform +def shear_x(image, magnitude, fillcolor=(128, 128, 128)): + factor = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), fillcolor=fillcolor) + + +def shear_y(image, magnitude, fillcolor=(128, 128, 128)): + factor = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), fillcolor=fillcolor) + + +def translate_x_relative(image, magnitude, fillcolor=(128, 128, 128)): + pixels = magnitude * image.size[0] + pixels = pixels * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), fillcolor=fillcolor) + + +def translate_y_relative(image, magnitude, fillcolor=(128, 128, 128)): + pixels = magnitude * 
image.size[0] + pixels = pixels * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), fillcolor=fillcolor) + + +def translate_x_absolute(image, magnitude, fillcolor=(128, 128, 128)): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, magnitude, 0, 1, 0), fillcolor=fillcolor) + + +def translate_y_absolute(image, magnitude, fillcolor=(128, 128, 128)): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude), fillcolor=fillcolor) + + +def rotate(image, magnitude): + rot = image.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new('RGBA', rot.size, (128, ) * 4), + rot).convert(image.mode) + + +def auto_contrast(image, magnitude=None): + return ImageOps.autocontrast(image) + + +def invert(image, magnitude=None): + return ImageOps.invert(image) + + +def equalize(image, magnitude=None): + return ImageOps.equalize(image) + + +def solarize(image, magnitude): + return ImageOps.solarize(image, magnitude) + + +def posterize(image, magnitude): + return ImageOps.posterize(image, magnitude) + + +def contrast(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Contrast(image).enhance(1 + magnitude) + + +def color(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Color(image).enhance(1 + magnitude) + + +def brightness(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Brightness(image).enhance(1 + magnitude) + + +def sharpness(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Sharpness(image).enhance(1 + magnitude) + diff --git a/image_classification/DeiT/config.py b/image_classification/DeiT/config.py new file mode 100644 index 00000000..5bdcf9ea --- /dev/null +++ b/image_classification/DeiT/config.py @@ -0,0 +1,175 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration +Configuration for data, model archtecture, and training, etc. 
+Config can be set by .yaml file or by argparser(limited usage) +""" + +import os +from yacs.config import CfgNode as CN +import yaml + +_C = CN() +_C.BASE = [''] + +# data settings +_C.DATA = CN() +_C.DATA.BATCH_SIZE = 256 #256 # train batch_size for single GPU +_C.DATA.BATCH_SIZE_EVAL = 8 #64 # val batch_size for single GPU +_C.DATA.DATA_PATH = '/dataset/imagenet/' # path to dataset +_C.DATA.DATASET = 'imagenet2012' # dataset name +_C.DATA.IMAGE_SIZE = 224 # input image size: 224 for pretrain, 384 for finetune +_C.DATA.CROP_PCT = 0.875 # input image scale ratio, scale is applied before centercrop in eval mode +_C.DATA.NUM_WORKERS = 2 # number of data loading threads + +# model settings +_C.MODEL = CN() +_C.MODEL.TYPE = 'DeiT' +_C.MODEL.NAME = 'DeiT' +_C.MODEL.RESUME = None +_C.MODEL.RESUME_EMA = None +_C.MODEL.PRETRAINED = None +_C.MODEL.NUM_CLASSES = 1000 +_C.MODEL.DROPOUT = 0.1 +_C.MODEL.ATTENTION_DROPOUT = 0.0 + +# transformer settings +_C.MODEL.TRANS = CN() +_C.MODEL.TRANS.PATCH_SIZE = 16 +_C.MODEL.TRANS.IN_CHANNELS = 3 +_C.MODEL.TRANS.EMBED_DIM = 192 +_C.MODEL.TRANS.DEPTH = 12 +_C.MODEL.TRANS.MLP_RATIO = 4.0 +_C.MODEL.TRANS.NUM_HEADS = 4 +_C.MODEL.TRANS.QKV_BIAS = True +_C.MODEL.TRANS.INIT_VALUES = 1e-5 + + +# training settings +_C.TRAIN = CN() +_C.TRAIN.LAST_EPOCH = 0 +_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 3 #34 # ~ 10k steps for 4096 batch size +_C.TRAIN.WEIGHT_DECAY = 0.05 #0.3 # 0.0 for finetune +_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune +_C.TRAIN.WARMUP_START_LR = 1e-6 #0.0 +_C.TRAIN.END_LR = 5e-4 +_C.TRAIN.GRAD_CLIP = 1.0 +_C.TRAIN.ACCUM_ITER = 2 #1 + +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' +_C.TRAIN.LR_SCHEDULER.MILESTONES = "30, 60, 90" # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 # only used in StepLRScheduler + +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'AdamW' +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for adamW +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# train augmentation +_C.TRAIN.MIXUP_ALPHA = 0.8 +_C.TRAIN.CUTMIX_ALPHA = 1.0 +_C.TRAIN.CUTMIX_MINMAX = None +_C.TRAIN.MIXUP_PROB = 1.0 +_C.TRAIN.MIXUP_SWITCH_PROB = 0.5 +_C.TRAIN.MIXUP_MODE = 'batch' + +_C.TRAIN.SMOOTHING = 0.1 +_C.TRAIN.COLOR_JITTER = 0.4 +_C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' + +_C.TRAIN.RANDOM_ERASE_PROB = 0.25 +_C.TRAIN.RANDOM_ERASE_MODE = 'pixel' +_C.TRAIN.RANDOM_ERASE_COUNT = 1 +_C.TRAIN.RANDOM_ERASE_SPLIT = False + +_C.TRAIN.DISTILLATION_TYPE = 'hard' # hard, soft, none +_C.TRAIN.DISTILLATION_ALPHA = 0.5 +_C.TRAIN.DISTILLATION_TAU = 1.0 +_C.TRAIN.TEACHER_MODEL = './regnety_160' # no ext is needed + +_C.TRAIN.MODEL_EMA = True +_C.TRAIN.MODEL_EMA_DECAY = 0.99996 + +# misc +_C.SAVE = "./output" +_C.TAG = "default" +_C.SAVE_FREQ = 5 # freq to save chpt +_C.REPORT_FREQ = 100 # freq to logging info +_C.VALIDATE_FREQ = 100 # freq to do validation +_C.SEED = 0 +_C.EVAL = False # run evaluation only +_C.LOCAL_RANK = 0 +_C.NGPUS = -1 + + +def _update_config_from_file(config, cfg_file): + config.defrost() + with open(cfg_file, 'r') as infile: + yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader) + for cfg in yaml_cfg.setdefault('BASE', ['']): + if cfg: + _update_config_from_file( + config, os.path.join(os.path.dirname(cfg_file), cfg) + ) + print('merging config from {}'.format(cfg_file)) + config.merge_from_file(cfg_file) + config.freeze() + +def update_config(config, args): + """Update 
config by ArgumentParser + Args: + args: ArgumentParser contains options + Return: + config: updated config + """ + if args.cfg: + _update_config_from_file(config, args.cfg) + config.defrost() + if args.dataset: + config.DATA.DATASET = args.dataset + if args.batch_size: + config.DATA.BATCH_SIZE = args.batch_size + if args.image_size: + config.DATA.IMAGE_SIZE = args.image_size + if args.data_path: + config.DATA.DATA_PATH = args.data_path + if args.ngpus: + config.NGPUS = args.ngpus + if args.eval: + config.EVAL = True + config.DATA.BATCH_SIZE_EVAL = args.batch_size + if args.pretrained: + config.MODEL.PRETRAINED = args.pretrained + if args.resume: + config.MODEL.RESUME = args.resume + if args.last_epoch: + config.TRAIN.LAST_EPOCH = args.last_epoch + if args.teacher_model: + config.TRAIN.TEACHER_MODEL = args.teacher_model + + #config.freeze() + return config + + +def get_config(cfg_file=None): + """Return a clone of config or load from yaml file""" + config = _C.clone() + if cfg_file: + _update_config_from_file(config, cfg_file) + return config diff --git a/image_classification/DeiT/configs/deit_base_patch16_224.yaml b/image_classification/DeiT/configs/deit_base_patch16_224.yaml new file mode 100644 index 00000000..dd0f608d --- /dev/null +++ b/image_classification/DeiT/configs/deit_base_patch16_224.yaml @@ -0,0 +1,23 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: DeiT + NAME: deit_base_patch16_224 + TRANS: + PATCH_SIZE: 16 + EMBED_DIM: 768 + MLP_RATIO: 4.0 + DEPTH: 12 + NUM_HEADS: 12 + QKV_BIAS: True +TRAIN: + NUM_EPOCHS: 300 + WARMUP_EPOCHS: 5 + WEIGHT_DECAY: 0.3 + BASE_LR: 0.003 + WARMUP_START_LR: 1e-6 + END_LR: 5e-4 + ACCUM_ITER: 2 + + diff --git a/image_classification/DeiT/configs/deit_base_patch16_384.yaml b/image_classification/DeiT/configs/deit_base_patch16_384.yaml new file mode 100644 index 00000000..8d21883f --- /dev/null +++ b/image_classification/DeiT/configs/deit_base_patch16_384.yaml @@ -0,0 +1,14 @@ +DATA: + IMAGE_SIZE: 384 + CROP_PCT: 1.0 +MODEL: + TYPE: DeiT + NAME: deit_base_patch16_384 + TRANS: + PATCH_SIZE: 16 + EMBED_DIM: 768 + MLP_RATIO: 4.0 + DEPTH: 12 + NUM_HEADS: 12 + QKV_BIAS: True + diff --git a/image_classification/DeiT/configs/regnety_160.yaml b/image_classification/DeiT/configs/regnety_160.yaml new file mode 100644 index 00000000..2efe559c --- /dev/null +++ b/image_classification/DeiT/configs/regnety_160.yaml @@ -0,0 +1,8 @@ +DATA: + IMAGE_SIZE: 224 + #IMAGE_SIZE: 288 + CROP_PCT: 0.875 +MODEL: + TYPE: RegNet + NAME: regnety_160 + diff --git a/image_classification/DeiT/datasets.py b/image_classification/DeiT/datasets.py new file mode 100644 index 00000000..067bbe39 --- /dev/null +++ b/image_classification/DeiT/datasets.py @@ -0,0 +1,210 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Dataset related classes and methods for ViT training and validation +Cifar10, Cifar100 and ImageNet2012 are supported +""" + +import os +import math +from PIL import Image +from paddle.io import Dataset, DataLoader, DistributedBatchSampler +from paddle.vision import transforms, datasets, image_load +from auto_augment import auto_augment_policy_original +from auto_augment import AutoAugment +from random_erasing import RandomErasing + +class ImageNet2012Dataset(Dataset): + """Build ImageNet2012 dataset + + This class gets train/val imagenet datasets, which loads transfomed data and labels. + + Attributes: + file_folder: path where imagenet images are stored + transform: preprocessing ops to apply on image + img_path_list: list of full path of images in whole dataset + label_list: list of labels of whole dataset + """ + + def __init__(self, file_folder, mode="train", transform=None): + """Init ImageNet2012 Dataset with dataset file path, mode(train/val), and transform""" + super(ImageNet2012Dataset, self).__init__() + assert mode in ["train", "val"] + self.file_folder = file_folder + self.transform = transform + self.img_path_list = [] + self.label_list = [] + + if mode == "train": + self.list_file = os.path.join(self.file_folder, "train_list.txt") + else: + self.list_file = os.path.join(self.file_folder, "val_list.txt") + + with open(self.list_file, 'r') as infile: + for line in infile: + img_path = line.strip().split()[0] + img_label = int(line.strip().split()[1]) + self.img_path_list.append(os.path.join(self.file_folder, img_path)) + self.label_list.append(img_label) + print(f'----- Imagenet2012 image {mode} list len = {len(self.label_list)}') + + def __len__(self): + return len(self.label_list) + + def __getitem__(self, index): + data = Image.open(self.img_path_list[index]).convert('RGB') + data = self.transform(data) + label = self.label_list[index] + + return data, label + + +def get_train_transforms(config): + """ Get training transforms + + For training, a RandomResizedCrop is applied, then normalization is applied with + [0.5, 0.5, 0.5] mean and std. The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + aug_op_list = [] + # random crop and resize + aug_op_list.append( + transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), + scale=(0.05, 1.0))) + # auto_augment / color jitter + if config.TRAIN.AUTO_AUGMENT: + policy = auto_augment_policy_original() + auto_augment = AutoAugment(policy) + aug_op_list.append(auto_augment) + else: + jitter = (float(config.TRAIN.COLOR_JITTER),) * 3 + aug_op_list.append(transforms.ColorJitter(jitter)) + # other ops + aug_op_list.append(transforms.ToTensor()) + aug_op_list.append(transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225])) + # random erasing + if config.TRAIN.RANDOM_ERASE_PROB > 0.: + random_erasing = RandomErasing(prob=config.TRAIN.RANDOM_ERASE_PROB, + mode=config.TRAIN.RANDOM_ERASE_MODE, + max_count=config.TRAIN.RANDOM_ERASE_COUNT, + num_splits=config.TRAIN.RANDOM_ERASE_SPLIT) + aug_op_list.append(random_erasing) + + transforms_train = transforms.Compose(aug_op_list) + return transforms_train + + +def get_val_transforms(config): + """ Get training transforms + + For validation, image is first Resize then CenterCrop to image_size. + Then normalization is applied with [0.5, 0.5, 0.5] mean and std. 
+ The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) + transforms_val = transforms.Compose([ + transforms.Resize(scale_size, 'bicubic'), + transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), + transforms.ToTensor(), + #transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_val + + +def get_dataset(config, mode='train'): + """ Get dataset from config and mode (train/val) + + Returns the related dataset object according to configs and mode(train/val) + + Args: + config: configs contains dataset related settings. see config.py for details + Returns: + dataset: dataset object + """ + + assert mode in ['train', 'val'] + if config.DATA.DATASET == "cifar10": + if mode == 'train': + dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "cifar100": + if mode == 'train': + dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "imagenet2012": + if mode == 'train': + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_train_transforms(config)) + else: + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_val_transforms(config)) + else: + raise NotImplementedError( + "[{config.DATA.DATASET}] Only cifar10, cifar100, imagenet2012 are supported now") + return dataset + + +def get_dataloader(config, dataset, mode='train', multi_process=False): + """Get dataloader with config, dataset, mode as input, allows multiGPU settings. + + Multi-GPU loader is implements as distributedBatchSampler. + + Args: + config: see config.py for details + dataset: paddle.io.dataset object + mode: train/val + multi_process: if True, use DistributedBatchSampler to support multi-processing + Returns: + dataloader: paddle.io.DataLoader object. + """ + + if mode == 'train': + batch_size = config.DATA.BATCH_SIZE + else: + batch_size = config.DATA.BATCH_SIZE_EVAL + + if multi_process is True: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=(mode == 'train')) + dataloader = DataLoader(dataset, + batch_sampler=sampler, + num_workers=config.DATA.NUM_WORKERS) + else: + dataloader = DataLoader(dataset, + batch_size=batch_size, + num_workers=config.DATA.NUM_WORKERS, + shuffle=(mode == 'train')) + return dataloader diff --git a/image_classification/DeiT/deit.png b/image_classification/DeiT/deit.png new file mode 100644 index 00000000..2ab6a921 Binary files /dev/null and b/image_classification/DeiT/deit.png differ diff --git a/image_classification/DeiT/deit.py b/image_classification/DeiT/deit.py new file mode 100644 index 00000000..995440c9 --- /dev/null +++ b/image_classification/DeiT/deit.py @@ -0,0 +1,308 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Implement DeiT +""" + +import math +import copy +import numpy as np +import paddle +import paddle.nn as nn + + +class Identity(nn.Layer): + """ Identity layer + + The output of this layer is the input without any change. + Use this layer to avoid using 'if' condition in forward methods + """ + def __init__(self): + super(Identity, self).__init__() + def forward(self, x): + return x + + +class PatchEmbedding(nn.Layer): + """Patch Embeddings + + Then a proj (conv2d) layer is applied as the patch embedding. + + Args: + image_size: int, input image size, default: 224 + patch_size: int, patch size for patch embedding (k and stride for proj conv), default: 8 + in_channels: int, input channels, default: 3 + embed_dim: int, output dimension of patch embedding, default: 384 + """ + def __init__(self, + image_size=224, + patch_size=8, + in_channels=3, + embed_dim=384): + super().__init__() + assert patch_size in [4, 8, 16] + + # define patch embeddings + self.proj = nn.Conv2D(in_channels, + embed_dim, + kernel_size = patch_size, + stride = patch_size) + # num patches + self.num_patches = (image_size // patch_size) * (image_size // patch_size) + + def forward(self, x): + x = self.proj(x) + x = x.flatten(2) + x = x.transpose([0, 2, 1]) + return x + + +class Mlp(nn.Layer): + """ MLP module + + Impl using nn.Linear and activation is GELU, dropout is applied. + Ops: fc -> act -> dropout -> fc -> dropout + + Attributes: + fc1: nn.Linear + fc2: nn.Linear + act: GELU + dropout1: dropout after fc1 + dropout2: dropout after fc2 + """ + + def __init__(self, in_features, hidden_features, dropout=0.): + super(Mlp, self).__init__() + w_attr_1, b_attr_1 = self._init_weights() + self.fc1 = nn.Linear(in_features, + hidden_features, + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + w_attr_2, b_attr_2 = self._init_weights() + self.fc2 = nn.Linear(hidden_features, + in_features, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + self.act = nn.GELU() + self.dropout = nn.Dropout(dropout) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierUniform()) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Normal(std=1e-6)) + return weight_attr, bias_attr + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.dropout(x) + x = self.fc2(x) + x = self.dropout(x) + return x + + +class Attention(nn.Layer): + """ Attention + + Regular Attention module same as ViT + + Args: + dim: int, all heads dimension + num_heads: int, num of heads + qkv_bias: bool, if True, qkv linear layer is using bias, default: False + qk_scale: float, if None, qk_scale is dim_head ** -0.5, default: None + attention_dropout: float, dropout rate for attention dropout, default: 0. + dropout: float, dropout rate for projection dropout, default: 0. 
+ """ + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attention_dropout=0., + dropout=0.): + super().__init__() + self.num_heads = num_heads + self.embed_dim = dim + self.dim_head = dim // num_heads + self.scale = qk_scale or self.dim_head ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_dropout = nn.Dropout(attention_dropout) + self.softmax = nn.Softmax(axis=-1) + self.proj = nn.Linear(dim, dim) + self.proj_dropout = nn.Dropout(dropout) + + def transpose_multihead(self, x): + new_shape = x.shape[:-1] + [self.num_heads, self.dim_head] + x = x.reshape(new_shape) + x = x.transpose([0, 2, 1, 3]) + return x + + def forward(self, x): + qkv = self.qkv(x).chunk(3, axis=-1) + q, k, v = map(self.transpose_multihead, qkv) + + attn = paddle.matmul(q, k, transpose_y=True) + attn = attn * self.scale + attn = self.softmax(attn) + attn = self.attn_dropout(attn) + + z = paddle.matmul(attn, v) + z = z.transpose([0, 2, 1, 3]) + + new_shape = z.shape[:-2] + [self.embed_dim] + z = z.reshape(new_shape) + z = self.proj(z) + z = self.proj_dropout(z) + + return z + + +class EncoderLayer(nn.Layer): + """Transformer Encoder Layer + + Transformer encoder module, same as ViT + + Args: + dim: int, all heads dimension + num_heads: int, num of heads + mlp_ratio: float, ratio to multiply with dim for mlp hidden feature dim, default: 4. + qkv_bias: bool, if True, qkv linear layer is using bias, default: False + qk_scale: float, if None, qk_scale is dim_head ** -0.5, default: None + attention_dropout: float, dropout rate for attention dropout, default: 0. + dropout: float, dropout rate for projection dropout, default: 0. + """ + + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + attention_dropout=0, + droppath=0.): + super().__init__() + self.norm1 = nn.LayerNorm(dim, epsilon=1e-6) + self.attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attention_dropout=attention_dropout) + self.drop_path = DropPath(droppath) if droppath > 0. 
else Identity() + self.norm2 = nn.LayerNorm(dim, epsilon=1e-6) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim * mlp_ratio)) + + def forward(self, x): + h = x + x = self.norm1(x) + x = self.attn(x) + x = self.drop_path(x) + x = h + x + + h = x + x = self.norm2(x) + x = self.mlp(x) + x = self.drop_path(x) + x = h + x + + return x + + +class Deit(nn.Layer): + def __init__(self, + image_size=224, + in_channels=3, + num_classes=1000, + patch_size=16, + embed_dim=192, + num_heads=3, + depth=12, + mlp_ratio=4, + qkv_bias=True, + dropout=0., + attention_dropout=0., + droppath=0.): + super().__init__() + self.num_classes = num_classes + # patch embedding + self.patch_embed = PatchEmbedding(image_size=image_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim) + # class token + self.class_token = paddle.create_parameter( + shape=[1, 1, embed_dim], + dtype='float32', + default_initializer=nn.initializer.Constant(0.)) + # distillation token + self.distill_token = paddle.create_parameter( + shape=[1, 1, embed_dim], + dtype='float32', + default_initializer=nn.initializer.TruncatedNormal(std=.02)) + # positional embedding + self.pos_embed = paddle.create_parameter( + shape=[1, self.patch_embed.num_patches + 2, embed_dim], + dtype='float32', + default_initializer=nn.initializer.TruncatedNormal(std=.02)) + self.pos_dropout = nn.Dropout(dropout) + + self.layers = nn.LayerList([ + copy.deepcopy(EncoderLayer(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + attention_dropout=attention_dropout, + droppath=droppath)) for _ in range(depth)]) + self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6) + + self.head = nn.Linear(embed_dim, num_classes) + self.head_distill = nn.Linear(embed_dim, num_classes) + + def forward_features(self, x): + x = self.patch_embed(x) + class_tokens = self.class_token.expand([x.shape[0], -1, -1]) + distill_tokens = self.distill_token.expand([x.shape[0], -1, -1]) + x = paddle.concat((class_tokens, distill_tokens, x), axis=1) + + x = x + self.pos_embed + x = self.pos_dropout(x) + + for layer in self.layers: + x = layer(x) + x = self.norm(x) + + return x[:, 0], x[:, 1] + + def forward(self, x): + x, x_distill = self.forward_features(x) + x = self.head(x) + x_distill = self.head_distill(x_distill) + if self.training: + return x, x_distill + else: + return (x + x_distill) / 2 + + +def build_deit(config): + """build deit model using config""" + model = Deit(image_size=config.DATA.IMAGE_SIZE, + depth=config.MODEL.TRANS.DEPTH, + embed_dim=config.MODEL.TRANS.EMBED_DIM, + mlp_ratio=config.MODEL.TRANS.MLP_RATIO, + num_heads=config.MODEL.TRANS.NUM_HEADS, + qkv_bias=config.MODEL.TRANS.QKV_BIAS) + return model diff --git a/image_classification/DeiT/losses.py b/image_classification/DeiT/losses.py new file mode 100644 index 00000000..082467a3 --- /dev/null +++ b/image_classification/DeiT/losses.py @@ -0,0 +1,123 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
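
A quick shape-check sketch for the DeiT model defined above: in train mode it returns the class-token and distill-token logits separately, while in eval mode it returns their average. The batch size and the default tiny-sized hyper-parameters are illustrative; it assumes `deit.py` is importable and PaddlePaddle is installed.

```python
import paddle
from deit import Deit

model = Deit(image_size=224, patch_size=16, embed_dim=192,
             num_heads=3, depth=12, num_classes=1000)
x = paddle.randn([2, 3, 224, 224])

model.train()
logits, logits_distill = model(x)           # class-token head and distill-token head
print(logits.shape, logits_distill.shape)   # [2, 1000] [2, 1000]

model.eval()
with paddle.no_grad():
    avg_logits = model(x)                   # eval mode averages the two heads
print(avg_logits.shape)                     # [2, 1000]
```
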
+ +""" Implement Loss functions """ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class LabelSmoothingCrossEntropyLoss(nn.Layer): + """ cross entropy loss for label smoothing + Args: + smoothing: float, smoothing rate + x: tensor, predictions (before softmax) with shape [N, num_classes] + target: tensor, target label with shape [N] + Return: + loss: float, cross entropy loss value + """ + def __init__(self, smoothing=0.1): + super().__init__() + assert 0 <= smoothing < 1.0 + self.smoothing = smoothing + self.confidence = 1 - smoothing + + def forward(self, x, target): + log_probs = F.log_softmax(x) # [N, num_classes] + # target_index is used to get prob for each of the N samples + target_index = paddle.zeros([x.shape[0], 2], dtype='int64') # [N, 2] + target_index[:, 0] = paddle.arange(x.shape[0]) + target_index[:, 1] = target + + nll_loss = -log_probs.gather_nd(index=target_index) # index: [N] + smooth_loss = -log_probs.mean(axis=-1) + loss = self.confidence * nll_loss + self.smoothing * smooth_loss + return loss.mean() + + +class SoftTargetCrossEntropyLoss(nn.Layer): + """ cross entropy loss for soft target + Args: + x: tensor, predictions (before softmax) with shape [N, num_classes] + target: tensor, soft target with shape [N, num_classes] + Returns: + loss: float, the mean loss value + """ + def __init__(self): + super().__init__() + + def forward(self, x, target): + loss = paddle.sum(-target * F.log_softmax(x, axis=-1), axis=-1) + return loss.mean() + + +class DistillationLoss(nn.Layer): + """Distillation loss function + This layer includes the orginal loss (criterion) and a extra + distillation loss (criterion), which computes the loss with + different type options, between current model and + a teacher model as its supervision. 
+ + Args: + base_criterion: nn.Layer, the original criterion + teacher_model: nn.Layer, the teacher model as supervision + distillation_type: str, one of ['none', 'soft', 'hard'] + alpha: float, ratio of base loss (* (1-alpha)) + and distillation loss( * alpha) + tao: float, temperature in distillation + """ + def __init__(self, + base_criterion, + teacher_model, + distillation_type, + alpha, + tau): + super().__init__() + assert distillation_type in ['none', 'soft', 'hard'] + self.base_criterion = base_criterion + self.teacher_model = teacher_model + self.type = distillation_type + self.alpha = alpha + self.tau = tau + + def forward(self, inputs, outputs, targets): + """ + Args: + inputs: tensor, the orginal model inputs + outputs: tensor, the outputs of the model + outputds_kd: tensor, the distillation outputs of the model, + this is usually obtained by a separate branch + in the last layer of the model + targets: tensor, the labels for the base criterion + """ + outputs, outputs_kd = outputs[0], outputs[1] + base_loss = self.base_criterion(outputs, targets) + if self.type == 'none': + return base_loss + + with paddle.no_grad(): + teacher_outputs = self.teacher_model(inputs) + + if self.type == 'soft': + distillation_loss = F.kl_div( + F.log_softmax(outputs_kd / self.tau, axis=1), + F.log_softmax(teacher_outputs / self.tau, axis=1), + reduction='sum') * (self.tau * self.tau) / outputs_kd.numel() + elif self.type == 'hard': + distillation_loss = F.cross_entropy(outputs_kd, teacher_outputs.argmax(axis=1)) + + loss = base_loss * (1 - self.alpha) + distillation_loss * self.alpha + return loss + + diff --git a/image_classification/DeiT/main_eval_regnet_multi_gpu.py b/image_classification/DeiT/main_eval_regnet_multi_gpu.py new file mode 100644 index 00000000..5de3dc52 --- /dev/null +++ b/image_classification/DeiT/main_eval_regnet_multi_gpu.py @@ -0,0 +1,183 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
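
A toy sketch of driving the `DistillationLoss` defined above. `ToyTeacher`, the 10-class shapes, and the random tensors are hypothetical stand-ins (the real scripts load a pretrained RegNetY-160 teacher); the student output must be the `(class_head, distill_head)` pair that the DeiT model returns in train mode.

```python
import paddle
import paddle.nn as nn
from losses import DistillationLoss

class ToyTeacher(nn.Layer):
    """Hypothetical teacher: slices flattened pixels into 10 fake logits."""
    def forward(self, x):
        return paddle.flatten(x, start_axis=1)[:, :10]

criterion = DistillationLoss(base_criterion=nn.CrossEntropyLoss(),
                             teacher_model=ToyTeacher(),
                             distillation_type='hard',   # 'none' | 'soft' | 'hard'
                             alpha=0.5,
                             tau=1.0)

images = paddle.randn([4, 3, 8, 8])
student_out = (paddle.randn([4, 10]), paddle.randn([4, 10]))  # (cls head, distill head)
labels = paddle.randint(0, 10, [4])

loss = criterion(images, student_out, labels)   # (1-alpha)*base + alpha*distill
print(float(loss))
```
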
+ +"""RegNet validation using multiple GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as dist +from datasets import get_dataloader, get_dataset +from regnet import build_regnet as build_model +from utils import AverageMeter +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('RegNet') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +arguments = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, arguments) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + dist.all_reduce(loss) + dist.all_reduce(acc1) + dist.all_reduce(acc5) + loss = loss / dist.get_world_size() + acc1 = acc1 / dist.get_world_size() + acc5 = acc5 / dist.get_world_size() + + batch_size = paddle.to_tensor(image.shape[0]) + dist.all_reduce(batch_size) + + val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) + val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) + val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0]) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, "+ + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, 
val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main_worker(*args): + # 0. Preparation + dist.init_parallel_env() + last_epoch = config.TRAIN.LAST_EPOCH + world_size = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + seed = config.SEED + local_rank + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # 1. Create model + model = build_model() + model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_val = args[0] + dataloader_val = get_dataloader(config, dataset_val, 'test', True) + total_batch_val = len(dataloader_val) + logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + # 3. define val criterion + val_criterion = nn.CrossEntropyLoss() + # 4. Load pretrained model / load resumt model and optimizer states + if config.MODEL.PRETRAINED: + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + # 5. Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=val_criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + + +def main(): + dataset_val = get_dataset(config, mode='val') + config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS + dist.spawn(main_worker, args=(dataset_val, ), nprocs=config.NGPUS) + + +if __name__ == "__main__": + main() diff --git a/image_classification/DeiT/main_multi_gpu.py b/image_classification/DeiT/main_multi_gpu.py new file mode 100644 index 00000000..4e59321b --- /dev/null +++ b/image_classification/DeiT/main_multi_gpu.py @@ -0,0 +1,439 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
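
The multi-GPU validation above combines metrics with `dist.all_reduce` (a sum over ranks) followed by division by the world size, and weights the running averages by the all-reduced batch size. A single-process arithmetic sketch of that bookkeeping, with made-up numbers:

```python
# Two ranks report accuracy 0.76 and 0.80 on a 64-image batch each.
world_size = 2
per_rank_acc1 = [0.76, 0.80]
per_rank_batch = [64, 64]

acc1 = sum(per_rank_acc1) / world_size   # what all_reduce followed by /world_size yields
global_batch = sum(per_rank_batch)       # all-reduced batch size

# AverageMeter-style running average: sum(value * n) / sum(n)
running_sum = acc1 * global_batch
running_n = global_batch
print(f"Avg Acc@1: {running_sum / running_n:.4f}")   # 0.7800
```
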
+ +"""DeiT training/validation using multiple GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as dist +from datasets import get_dataloader +from datasets import get_dataset +from deit import build_deit as build_model +from regnet import build_regnet as build_teacher_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config +from mixup import Mixup +from losses import LabelSmoothingCrossEntropyLoss +from losses import SoftTargetCrossEntropyLoss +from losses import DistillationLoss +from model_ema import ModelEma + + +parser = argparse.ArgumentParser('DeiT') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-teacher_model', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +arguments = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, arguments) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1, + model_ema=None, + mixup_fn=None): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + model_ema: ModelEma, model moving average instance + mixup_fn: Mixup, mixup instance + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + label_orig = label.clone() + + if mixup_fn is not None: + image, label = mixup_fn(image, label_orig) + + output = model(image) + loss = criterion(image, output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + # + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) 
or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + if model_ema is not None and paddle.distributed.get_rank() == 0: + model_ema.update(model) + + # average of output and kd_output, like model eval mode + pred = F.softmax((output[0] + output[1]) / 2) + if mixup_fn: + acc = paddle.metric.accuracy(pred, label_orig) + else: + acc = paddle.metric.accuracy(pred, label_orig.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + dist.all_reduce(loss) + dist.all_reduce(acc1) + dist.all_reduce(acc5) + loss = loss / dist.get_world_size() + acc1 = acc1 / dist.get_world_size() + acc5 = acc5 / dist.get_world_size() + + batch_size = paddle.to_tensor(image.shape[0]) + dist.all_reduce(batch_size) + + val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) + val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) + val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0]) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, "+ + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main_worker(*args): + # 0. Preparation + dist.init_parallel_env() + last_epoch = config.TRAIN.LAST_EPOCH + world_size = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + seed = config.SEED + local_rank + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # 1. Create model + model = build_model(config) + # 8. Define model ema + model_ema = None + if not config.EVAL: # only apply when training + if config.TRAIN.MODEL_EMA and local_rank == 0: + model_ema = ModelEma(model, decay=config.TRAIN.MODEL_EMA_DECAY) + model = paddle.DataParallel(model) + # 2. 
Create train and val dataloader + dataset_train, dataset_val = args[0], args[1] + dataloader_train = get_dataloader(config, dataset_train, 'train', True) + dataloader_val = get_dataloader(config, dataset_val, 'test', True) + total_batch_train = len(dataloader_train) + total_batch_val = len(dataloader_val) + logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + + # 3. Define mixup function + mixup_fn = None + if config.TRAIN.MIXUP_PROB > 0 or config.TRAIN.CUTMIX_ALPHA > 0 or config.TRAIN.CUTMIX_MINMAX is not None: + mixup_fn = Mixup(mixup_alpha=config.TRAIN.MIXUP_ALPHA, + cutmix_alpha=config.TRAIN.CUTMIX_ALPHA, + cutmix_minmax=config.TRAIN.CUTMIX_MINMAX, + prob=config.TRAIN.MIXUP_PROB, + switch_prob=config.TRAIN.MIXUP_SWITCH_PROB, + mode=config.TRAIN.MIXUP_MODE, + label_smoothing=config.TRAIN.SMOOTHING) + # 4. Define criterion + if config.TRAIN.MIXUP_PROB > 0.: + criterion = SoftTargetCrossEntropyLoss() + elif config.TRAIN.SMOOTHING: + criterion = LabelSmoothingCrossEntropyLoss() + else: + criterion = nn.CrossEntropyLoss() + + val_criterion = nn.CrossEntropyLoss() + + # 5. Create Teacher model + teacher_model = None + if not config.EVAL: + if config.TRAIN.DISTILLATION_TYPE != 'none': + logging.info(f'Creating teacher model: {config.TRAIN.TEACHER_MODEL}') + teacher_model = build_teacher_model() + assert os.path.isfile(config.TRAIN.TEACHER_MODEL + '.pdparams') + teacher_model_state = paddle.load(config.TRAIN.TEACHER_MODEL + '.pdparams') + teacher_model.set_dict(teacher_model_state) + teacher_model.eval() + logger.info(f"----- Load teacher model state from {config.TRAIN.TEACHER_MODEL}") + # wrap the criterion: + criterion = DistillationLoss(criterion, + teacher_model, + config.TRAIN.DISTILLATION_TYPE, + config.TRAIN.DISTILLATION_ALPHA, + config.TRAIN.DISTILLATION_TAU) + + # 6. 
Define optimizer and lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip, + #apply_decay_param_fun=get_exclude_from_weight_decay_fn(['pos_embed', 'cls_token']), + ) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # 7. Load pretrained model / load resumt model and optimizer states + if config.MODEL.PRETRAINED: + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + # load ema model + if model_ema is not None and os.path.isfile(config.MODEL.RESUME_EMA+'.pdparams'): + model_ema_state = paddle.load(config.MODEL.RESUME_EMA+'.pdparams') + model_ema.set_dict(model_ema_state) + logger.info(f"----- Load model ema from {config.MODEL.RESUME_EMA}") + # 8. 
Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=val_criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + + # 8. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + model_ema=model_ema, + mixup_fn=mixup_fn) + scheduler.step() + + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=val_criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if local_rank == 0: + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + if model_ema is not None: + model_ema_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}-EMA") + paddle.save(model_ema.state_dict(), model_ema_path + '.pdparams') + logger.info(f"----- Save ema model: {model_ema_path}.pdparams") + + +def main(): + # Build dataset + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS + dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + + +if __name__ == "__main__": + main() diff --git a/image_classification/DeiT/main_single_gpu.py b/image_classification/DeiT/main_single_gpu.py new file mode 100644 index 00000000..0834dc11 --- /dev/null +++ b/image_classification/DeiT/main_single_gpu.py @@ -0,0 +1,405 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""DeiT training/validation using single GPU """ + +import sys +import os +import time +import logging +import copy +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from datasets import get_dataloader +from datasets import get_dataset +from deit import build_deit as build_model +from regnet import build_regnet as build_teacher_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config +from mixup import Mixup +from losses import LabelSmoothingCrossEntropyLoss +from losses import SoftTargetCrossEntropyLoss +from losses import DistillationLoss +from model_ema import ModelEma + + +parser = argparse.ArgumentParser('DeiT') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-teacher_model', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +args = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, args) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +#config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1, + model_ema=None, + mixup_fn=None): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + mode_ema: ModelEma, model moving average instance + mixup_fn: Mixup, mixup instance + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + label_orig = label.clone() + + if mixup_fn is not None: + image, label = mixup_fn(image, label_orig) + + output = model(image) + loss = criterion(image, output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + 
#loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + if model_ema is not None: + model_ema.update(model) + + # average of output and kd_output, like model eval mode + pred = F.softmax((output[0] + output[1]) / 2) + if mixup_fn: + acc = paddle.metric.accuracy(pred, label_orig) + else: + acc = paddle.metric.accuracy(pred, label_orig.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + batch_size = image.shape[0] + val_loss_meter.update(loss.numpy()[0], batch_size) + val_acc1_meter.update(acc1.numpy()[0], batch_size) + val_acc5_meter.update(acc5.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main(): + # 0. Preparation + last_epoch = config.TRAIN.LAST_EPOCH + seed = config.SEED + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + #paddle.set_device('gpu:0') + # 1. Create model + model = build_model(config) + # 2. Define model ema + model_ema = None + if not config.EVAL:# only apply ema when training + if config.TRAIN.MODEL_EMA: + model_ema = ModelEma(model, decay=config.TRAIN.MODEL_EMA_DECAY) + # 3. Create train and val dataloader + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + dataloader_train = get_dataloader(config, dataset_train, 'train', False) + dataloader_val = get_dataloader(config, dataset_val, 'val', False) + # 4. Define mixup function + mixup_fn = None + if config.TRAIN.MIXUP_PROB > 0 or config.TRAIN.CUTMIX_ALPHA > 0 or config.TRAIN.CUTMIX_MINMAX is not None: + mixup_fn = Mixup(mixup_alpha=config.TRAIN.MIXUP_ALPHA, + cutmix_alpha=config.TRAIN.CUTMIX_ALPHA, + cutmix_minmax=config.TRAIN.CUTMIX_MINMAX, + prob=config.TRAIN.MIXUP_PROB, + switch_prob=config.TRAIN.MIXUP_SWITCH_PROB, + mode=config.TRAIN.MIXUP_MODE, + label_smoothing=config.TRAIN.SMOOTHING) + # 5. 
Define criterion + if config.TRAIN.MIXUP_PROB > 0.: + criterion = SoftTargetCrossEntropyLoss() + elif config.TRAIN.SMOOTHING: + criterion = LabelSmoothingCrossEntropyLoss() + else: + criterion = nn.CrossEntropyLoss() + # only use cross entropy for val + val_criterion = nn.CrossEntropyLoss() + # 6. Create Teacher model + teacher_model = None + if not config.EVAL: + if config.TRAIN.DISTILLATION_TYPE != 'none': + logging.info(f'Creating teacher model: {config.TRAIN.TEACHER_MODEL}') + teacher_model = build_teacher_model() + assert os.path.isfile(config.TRAIN.TEACHER_MODEL + '.pdparams') + teacher_model_state = paddle.load(config.TRAIN.TEACHER_MODEL + '.pdparams') + teacher_model.set_dict(teacher_model_state) + teacher_model.eval() + logger.info(f"----- Load teacher model state from {config.TRAIN.TEACHER_MODEL}") + # wrap the criterion: + criterion = DistillationLoss(criterion, + teacher_model, + config.TRAIN.DISTILLATION_TYPE, + config.TRAIN.DISTILLATION_ALPHA, + config.TRAIN.DISTILLATION_TAU) + # 7. Define lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + # 8. Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + # 9. 
Load pretrained model or load resume model and optimizer states + if config.MODEL.PRETRAINED: + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_dict(opt_state) + logger.info( + f"----- Resume Training: Load model and optmizer from {config.MODEL.RESUME}") + if model_ema is not None and os.path.isfile(config.MODEL.RESUME_EMA+'.pdparams'): + model_ema_state = paddle.load(config.MODEL.RESUME_EMA+'.pdparams') + model_ema.set_dict(model_ema_state) + logger.info(f"----- Load model ema from {config.MODEL.RESUME_EMA}") + + # 10. Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=val_criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + # 10. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=len(dataloader_train), + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + model_ema=model_ema, + mixup_fn=mixup_fn) + scheduler.step() + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=val_criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + # save model ema + if model_ema is not None: + model_ema_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}-EMA") + paddle.save(model_ema.state_dict(), model_ema_path + '.pdparams') + logger.info(f"----- Save ema model: {model_ema_path}.pdparams") + +if __name__ == "__main__": + main() diff --git 
a/image_classification/DeiT/mixup.py b/image_classification/DeiT/mixup.py new file mode 100644 index 00000000..1d2db493 --- /dev/null +++ b/image_classification/DeiT/mixup.py @@ -0,0 +1,225 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""mixup and cutmix for batch data""" +import numpy as np +import paddle + + +def rand_bbox(image_shape, lam, count=None): + """ CutMix bbox by lam value + Generate 1 random bbox by value lam. lam is the cut size rate. + The cut_size is computed by sqrt(1-lam) * image_size. + + Args: + image_shape: tuple/list, image height and width + lam: float, cutmix lambda value + count: int, number of bbox to generate + """ + image_h, image_w = image_shape[-2:] + cut_rate = np.sqrt(1. - lam) + cut_h = int(cut_rate * image_h) + cut_w = int(cut_rate * image_w) + + # get random bbox center + cy = np.random.randint(0, image_h, size=count) + cx = np.random.randint(0, image_w, size=count) + + # get bbox coords + bbox_x1 = np.clip(cx - cut_w // 2, 0, image_w) + bbox_y1 = np.clip(cy - cut_h // 2, 0, image_h) + bbox_x2 = np.clip(cx + cut_w // 2, 0, image_w) + bbox_y2 = np.clip(cy + cut_h // 2, 0, image_h) + + # NOTE: in paddle, tensor indexing e.g., a[x1:x2], + # if x1 == x2, paddle will raise ValueErros, + # while in pytorch, it will return [] tensor + return bbox_x1, bbox_y1, bbox_x2, bbox_y2 + + +def rand_bbox_minmax(image_shape, minmax, count=None): + """ CutMix bbox by min and max value + Generate 1 random bbox by min and max percentage values. + Minmax is a tuple/list of min and max percentage vlaues + applied to the image width and height. + + Args: + image_shape: tuple/list, image height and width + minmax: tuple/list, min and max percentage values of image size + count: int, number of bbox to generate + """ + assert len(minmax) == 2 + image_h, image_w = image_shape[-2:] + min_ratio = minmax[0] + max_ratio = minmax[1] + cut_h = np.random.randint(int(image_h * min_ratio), int(image_h * max_ratio), size=count) + cut_w = np.random.randint(int(image_w * min_ratio), int(image_w * max_ratio), size=count) + + bbox_x1 = np.random.randint(0, image_w - cut_w, size=count) + bbox_y1 = np.random.randint(0, image_h - cut_h, size=count) + bbox_x2 = bbox_x1 + cut_w + bbox_y2 = bbox_y1 + cut_h + + return bbox_x1, bbox_y1, bbox_x2, bbox_y2 + + +def cutmix_generate_bbox_adjust_lam(image_shape, lam, minmax=None, correct_lam=True, count=None): + """Generate bbox and apply correction for lambda + If the mimmax is None, apply the standard cutmix by lam value, + If the minmax is set, apply the cutmix by min and max percentage values. 
+ + Args: + image_shape: tuple/list, image height and width + lam: float, cutmix lambda value + minmax: tuple/list, min and max percentage values of image size + correct_lam: bool, if True, correct the lam value by the generated bbox + count: int, number of bbox to generate + """ + if minmax is not None: + bbox_x1, bbox_y1, bbox_x2, bbox_y2 = rand_bbox_minmax(image_shape, minmax, count) + else: + bbox_x1, bbox_y1, bbox_x2, bbox_y2 = rand_bbox(image_shape, lam, count) + + if correct_lam or minmax is not None: + image_h, image_w = image_shape[-2:] + bbox_area = (bbox_y2 - bbox_y1) * (bbox_x2 - bbox_x1) + lam = 1. - bbox_area / float(image_h * image_w) + return (bbox_x1, bbox_y1, bbox_x2, bbox_y2), lam + + +def one_hot(x, num_classes, on_value=1., off_value=0.): + """ Generate one-hot vector for label smoothing + Args: + x: tensor, contains label/class indices + num_classes: int, num of classes (len of the one-hot vector) + on_value: float, the vector value at label index, default=1. + off_value: float, the vector value at non-label indices, default=0. + Returns: + one_hot: tensor, tensor with on value at label index and off value + at non-label indices. + """ + x = x.reshape_([-1, 1]) + x_smoothed = paddle.full((x.shape[0], num_classes), fill_value=off_value) + for i in range(x.shape[0]): + x_smoothed[i, x[i]] = on_value + return x_smoothed + + +def mixup_one_hot(label, num_classes, lam=1., smoothing=0.): + """ mixup and label smoothing in batch + label smoothing is firstly applied, then + mixup is applied by mixing the bacth and its flip, + with a mixup rate. + + Args: + label: tensor, label tensor with shape [N], contains the class indices + num_classes: int, num of all classes + lam: float, mixup rate, default=1.0 + smoothing: float, label smoothing rate + """ + off_value = smoothing / num_classes + on_value = 1. - smoothing + off_value + y1 = one_hot(label, num_classes, on_value, off_value) + y2 = one_hot(label.flip(axis=[0]), num_classes, on_value, off_value) + return y2 * (1 - lam) + y1 * lam + + +class Mixup: + """Mixup class + Args: + mixup_alpha: float, mixup alpha for beta distribution, default=1.0, + cutmix_alpha: float, cutmix alpha for beta distribution, default=0.0, + cutmix_minmax: list/tuple, min and max value for cutmix ratio, default=None, + prob: float, if random prob < prob, do not use mixup, default=1.0, + switch_prob: float, prob of switching mixup and cutmix, default=0.5, + mode: string, mixup up, now only 'batch' is supported, default='batch', + correct_lam: bool, if True, apply correction of lam, default=True, + label_smoothing: float, label smoothing rate, default=0.1, + num_classes: int, num of classes, default=1000 + """ + def __init__(self, + mixup_alpha=1.0, + cutmix_alpha=0.0, + cutmix_minmax=None, + prob=1.0, + switch_prob=0.5, + mode='batch', + correct_lam=True, + label_smoothing=0.1, + num_classes=1000): + self.mixup_alpha = mixup_alpha + self.cutmix_alpha = cutmix_alpha + self.cutmix_minmax = cutmix_minmax + if cutmix_minmax is not None: + assert len(cutmix_minmax) == 2 + self.cutmix_alpha = 1.0 + self.mix_prob = prob + self.switch_prob = switch_prob + self.label_smoothing = label_smoothing + self.num_classes = num_classes + self.mode = mode + self.correct_lam = correct_lam + assert mode == 'batch', 'Now only batch mode is supported!' 
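A minimal usage sketch for the `Mixup` class defined above — the batch contents and hyper-parameter values below are illustrative, not taken from the DeiT configs. The transform is applied per batch, requires an even batch size, and returns soft targets of shape `[N, num_classes]`, so it is meant to be paired with a soft-target cross entropy loss:

```python
import paddle
from mixup import Mixup  # the module added in this diff

images = paddle.randn([8, 3, 224, 224])     # even batch size is required by __call__
labels = paddle.randint(0, 1000, [8])       # integer class indices

mixup_fn = Mixup(mixup_alpha=0.8,
                 cutmix_alpha=1.0,
                 prob=1.0,
                 switch_prob=0.5,
                 label_smoothing=0.1,
                 num_classes=1000)

mixed_images, soft_labels = mixup_fn(images, labels)
print(soft_labels.shape)  # [8, 1000]: smoothed one-hot labels mixed with the flipped batch
```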
+ + def __call__(self, x, target): + assert x.shape[0] % 2 == 0, "Batch size should be even" + lam = self._mix_batch(x) + target = mixup_one_hot(target, self.num_classes, lam, self.label_smoothing) + return x, target + + def get_params(self): + """Decide to use cutmix or regular mixup by sampling and + sample lambda for mixup + """ + lam = 1. + use_cutmix = False + use_mixup = np.random.rand() < self.mix_prob + if use_mixup: + if self.mixup_alpha > 0. and self.cutmix_alpha > 0.: + use_cutmix = np.random.rand() < self.switch_prob + alpha = self.cutmix_alpha if use_cutmix else self.mixup_alpha + lam_mix = np.random.beta(alpha, alpha) + elif self.mixup_alpha == 0. and self.cutmix_alpha > 0.: + use_cutmix=True + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) + elif self.mixup_alpha > 0. and self.cutmix_alpha == 0.: + lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha) + else: + raise ValueError('mixup_alpha and cutmix_alpha cannot be all 0') + lam = float(lam_mix) + return lam, use_cutmix + + def _mix_batch(self, x): + """mixup/cutmix by adding batch data and its flipped version""" + lam, use_cutmix = self.get_params() + if lam == 1.: + return lam + if use_cutmix: + (bbox_x1, bbox_y1, bbox_x2, bbox_y2), lam = cutmix_generate_bbox_adjust_lam( + x.shape, + lam, + minmax=self.cutmix_minmax, + correct_lam=self.correct_lam) + + # NOTE: in paddle, tensor indexing e.g., a[x1:x2], + # if x1 == x2, paddle will raise ValueErros, + # but in pytorch, it will return [] tensor without errors + if int(bbox_x1) != int(bbox_x2) and int(bbox_y1) != int(bbox_y2): + x[:, :, int(bbox_x1): int(bbox_x2), int(bbox_y1): int(bbox_y2)] = x.flip(axis=[0])[ + :, :, int(bbox_x1): int(bbox_x2), int(bbox_y1): int(bbox_y2)] + else: + x_flipped = x.flip(axis=[0]) + x_flipped = x_flipped * (1 - lam) + x.set_value(x * (lam) + x_flipped) + return lam diff --git a/image_classification/DeiT/model_ema.py b/image_classification/DeiT/model_ema.py new file mode 100644 index 00000000..389ab685 --- /dev/null +++ b/image_classification/DeiT/model_ema.py @@ -0,0 +1,58 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Implement the Exponential Model Averaging +This is paddle hack from: +https://github.com/rwightman/pytorch-image-models/blob/master/timm/utils/model_ema.py +""" + +import copy +from collections import OrderedDict +import paddle +import paddle.nn as nn + + +class ModelEma: + """Model Ema + A moving average is kept of model weights and buffers. + Note that for multiple gpu, ema must be defined after mode init, + but before DataParallel. 
+ + Args: + model: nn.Layer, original modela with learnable params + decay: float, decay rate for each update, default: 0.999 + """ + def __init__(self, model, decay=0.999): + self.module = copy.deepcopy(model) + self.module.eval() + self.decay = decay + + @paddle.no_grad() + def _update(self, model, update_fn): + # update ema model parameters by model parameters + for (_, ema_param), (_, model_param) in zip( + self.module.named_parameters(), model.named_parameters()): + ema_param.set_value(copy.deepcopy(update_fn(ema_param, model_param))) + + # update ema model buffers by model buffers + for (_, ema_buf), (_, model_buf) in zip( + self.module.named_buffers(), model.named_buffers()): + ema_buf.set_value(copy.deepcopy(update_fn(ema_buf, model_buf))) + + def update(self, model): + self._update(model, update_fn=lambda e, m: self.decay * e + (1 - self.decay) * m) + + def set(self, model): + self._update(model, update_fn=lambda e, m: m) + diff --git a/image_classification/DeiT/port_weights/load_pytorch_weights_base_patch16_224.py b/image_classification/DeiT/port_weights/load_pytorch_weights_base_patch16_224.py new file mode 100644 index 00000000..975baed3 --- /dev/null +++ b/image_classification/DeiT/port_weights/load_pytorch_weights_base_patch16_224.py @@ -0,0 +1,176 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
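A self-contained sketch of how the `ModelEma` class above is meant to be used (the tiny linear model, SGD settings, and loop length are placeholders, not the DeiT training setup): the EMA copy is updated once per optimizer step, and the averaged weights are read from `model_ema.module`.

```python
import paddle
import paddle.nn as nn
from model_ema import ModelEma  # the module added in this diff

model = nn.Linear(8, 4)                    # placeholder model
model_ema = ModelEma(model, decay=0.999)   # EMA copy, kept in eval mode

criterion = nn.CrossEntropyLoss()
optimizer = paddle.optimizer.SGD(learning_rate=0.1, parameters=model.parameters())

for _ in range(3):
    x = paddle.rand([2, 8])
    y = paddle.randint(0, 4, [2])
    loss = criterion(model(x), y)
    loss.backward()
    optimizer.step()
    optimizer.clear_grad()
    model_ema.update(model)  # ema = decay * ema + (1 - decay) * current weights

# the averaged weights live in model_ema.module (evaluate or checkpoint this copy)
print(model_ema.module.weight.numpy()[0, :3])
```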
+ +import argparse +import numpy as np +import paddle +import torch +from deit import * +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/deit_base_patch16_224.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-teacher_model', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + +def torch_to_paddle_mapping(): + mapping = [ + ('cls_token', 'class_token'), + ('dist_token', 'distill_token'), + ('pos_embed', 'pos_embed'), + ('patch_embed.proj', f'patch_embed.proj'), + ] + + num_layers = config.MODEL.TRANS.DEPTH + for idx in range(num_layers): + th_prefix = f'blocks.{idx}' + pp_prefix = f'layers.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ('head_dist', 'head_distill') + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_deit(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model = torch.hub.load('facebookresearch/deit:main', + 'deit_base_distilled_patch16_224', + pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + + #return + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./deit_base_distilled_patch16_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/DeiT/port_weights/load_pytorch_weights_base_patch16_384.py b/image_classification/DeiT/port_weights/load_pytorch_weights_base_patch16_384.py new file mode 100644 index 00000000..e995afde --- /dev/null +++ b/image_classification/DeiT/port_weights/load_pytorch_weights_base_patch16_384.py @@ -0,0 +1,176 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
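A note on the transpose inside `_set_value` in the conversion script above (the layer sizes below are arbitrary examples): torch's `nn.Linear` stores its weight as `[out_features, in_features]`, while paddle's `nn.Linear` stores `[in_features, out_features]`, so every 2-D weight matrix has to be transposed when it is copied across. A quick check:

```python
import numpy as np
import paddle
import torch

torch_fc = torch.nn.Linear(768, 3072)
paddle_fc = paddle.nn.Linear(768, 3072)

print(tuple(torch_fc.weight.shape))   # (3072, 768) -> [out_features, in_features]
print(paddle_fc.weight.shape)         # [768, 3072] -> [in_features, out_features]

# the same transpose as in _set_value applies to any 2-D weight being ported
paddle_fc.weight.set_value(torch_fc.weight.data.numpy().transpose((1, 0)))
paddle_fc.bias.set_value(torch_fc.bias.data.numpy())

x = np.random.randn(2, 768).astype('float32')
out_torch = torch_fc(torch.Tensor(x)).data.numpy()
out_paddle = paddle_fc(paddle.to_tensor(x)).numpy()
print(np.abs(out_torch - out_paddle).max())  # on the order of 1e-6 for float32
```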
+ +import argparse +import numpy as np +import paddle +import torch +from deit import * +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/deit_base_patch16_384.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-teacher_model', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + +def torch_to_paddle_mapping(): + mapping = [ + ('cls_token', 'class_token'), + ('dist_token', 'distill_token'), + ('pos_embed', 'pos_embed'), + ('patch_embed.proj', f'patch_embed.proj'), + ] + + num_layers = config.MODEL.TRANS.DEPTH + for idx in range(num_layers): + th_prefix = f'blocks.{idx}' + pp_prefix = f'layers.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ('head_dist', 'head_distill') + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_deit(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model = torch.hub.load('facebookresearch/deit:main', + 'deit_base_distilled_patch16_384', + pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + + #return + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 384, 384).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./deit_base_distilled_patch16_384.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/DeiT/port_weights/load_pytorch_weights_regnety_160.py b/image_classification/DeiT/port_weights/load_pytorch_weights_regnety_160.py new file mode 100644 index 00000000..3bbc1560 --- /dev/null +++ b/image_classification/DeiT/port_weights/load_pytorch_weights_regnety_160.py @@ -0,0 +1,170 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os +import numpy as np +import paddle +import torch +import timm +from regnet import * + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + +def torch_to_paddle_mapping(): + mapping = [ + ('stem.conv.weight', 'stem.0.weight'), + ('stem.bn', 'stem.1'), + ] + + depths = [2, 4, 11, 1] + for idx in range(len(depths)): + for block_idx in range(depths[idx]): + th_prefix = f's{idx+1}.b{block_idx+1}' + pp_prefix = f'stages.{idx}.blocks.{block_idx}' + + layer_mapping = [ + (f'{th_prefix}.conv1.conv', f'{pp_prefix}.conv1'), + (f'{th_prefix}.conv1.bn', f'{pp_prefix}.bn1'), + (f'{th_prefix}.conv2.conv', f'{pp_prefix}.conv2'), + (f'{th_prefix}.conv2.bn', f'{pp_prefix}.bn2'), + (f'{th_prefix}.se.fc1', f'{pp_prefix}.se.conv1_1x1'), + (f'{th_prefix}.se.fc2', f'{pp_prefix}.se.conv2_1x1'), + (f'{th_prefix}.downsample.conv', f'{pp_prefix}.downsample.conv1x1'), + (f'{th_prefix}.downsample.bn', f'{pp_prefix}.downsample.bn'), + (f'{th_prefix}.conv3.conv', f'{pp_prefix}.conv3'), + (f'{th_prefix}.conv3.bn', f'{pp_prefix}.bn3'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('head.fc', 'head.2'), + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + if f'{th_name}.weight' in th_params.keys(): + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + if f'{th_name}.running_mean' in th_params.keys(): + th_name_b = f'{th_name}.running_mean' + pd_name_b = f'{pd_name}._mean' + _set_value(th_name_b, pd_name_b) + + if f'{th_name}.running_var' in th_params.keys(): + th_name_b = f'{th_name}.running_var' + pd_name_b = f'{pd_name}._variance' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_regnet() + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model = timm.create_model('regnety_160', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + #return + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 288, 288).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + print(torch_model) + out_torch = torch_model(x_torch) + print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%') + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./regnety_160.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/DeiT/random_erasing.py b/image_classification/DeiT/random_erasing.py new file mode 100644 index 00000000..1252f85d --- /dev/null +++ b/image_classification/DeiT/random_erasing.py @@ -0,0 +1,119 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
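A note on the extra buffer handling in the RegNet conversion above (the channel count below is arbitrary): BatchNorm running statistics are buffers rather than parameters, and the two frameworks name them differently, which is why the mapping copies `running_mean`/`running_var` into paddle's `_mean`/`_variance`. Torch's `num_batches_tracked` buffer has no paddle counterpart, so the script simply leaves it out.

```python
import paddle
import torch

torch_bn = torch.nn.BatchNorm2d(32)
paddle_bn = paddle.nn.BatchNorm2D(32)

print([name for name, _ in torch_bn.named_buffers()])
# e.g. ['running_mean', 'running_var', 'num_batches_tracked']

print([name for name, _ in paddle_bn.named_buffers()])
# e.g. ['_mean', '_variance']
```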
+ +"""Random Erasing for image tensor""" + +import random +import math +import paddle + + +def _get_pixels(per_pixel, rand_color, patch_size, dtype="float32"): + if per_pixel: + return paddle.normal(shape=patch_size).astype(dtype) + elif rand_color: + return paddle.normal(shape=(patch_size[0], 1, 1)).astype(dtype) + else: + return paddle.zeros((patch_size[0], 1, 1)).astype(dtype) + + +class RandomErasing(object): + """ + Args: + prob: probability of performing random erasing + min_area: Minimum percentage of erased area wrt input image area + max_area: Maximum percentage of erased area wrt input image area + min_aspect: Minimum aspect ratio of earsed area + max_aspect: Maximum aspect ratio of earsed area + mode: pixel color mode, in ['const', 'rand', 'pixel'] + 'const' - erase block is constant valued 0 for all channels + 'rand' - erase block is valued random color (same per-channel) + 'pixel' - erase block is vauled random color per pixel + min_count: Minimum # of ereasing blocks per image. + max_count: Maximum # of ereasing blocks per image. Area per box is scaled by count + per-image count is randomly chosen between min_count to max_count + """ + def __init__(self, prob=0.5, min_area=0.02, max_area=1/3, min_aspect=0.3, max_aspect=None, + mode='const', min_count=1, max_count=None, num_splits=0): + self.prob = prob + self.min_area = min_area + self.max_area = max_area + max_aspect = max_aspect or 1 / min_aspect + self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + self.min_count = min_count + self.max_count = max_count or min_count + self.num_splits = num_splits + mode = mode.lower() + self.rand_color = False + self.per_pixel = False + if mode == "rand": + self.rand_color = True + elif mode == "pixel": + self.per_pixel = True + else: + assert not mode or mode == "const" + + def _erase(self, img, chan, img_h, img_w, dtype): + if random.random() > self.prob: + return + area = img_h * img_w + count = self.min_count if self.min_count == self.max_count else \ + random.randint(self.min_count, self.max_count) + for _ in range(count): + for attempt in range(10): + target_area = random.uniform(self.min_area, self.max_area) * area / count + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + if w < img_w and h < img_h: + top = random.randint(0, img_h - h) + left = random.randint(0, img_w - w) + img[:, top:top+h, left:left+w] = _get_pixels( + self.per_pixel, self.rand_color, (chan, h, w), + dtype=dtype) + break + + def __call__(self, input): + if len(input.shape) == 3: + self._erase(input, *input.shape, input.dtype) + else: + batch_size, chan, img_h, img_w = input.shape + batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 + for i in range(batch_start, batch_size): + self._erase(input[i], chan, img_h, img_w, input.dtype) + return input + + + +#def main(): +# re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='rand') +# #re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='const') +# #re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='pixel') +# import PIL.Image as Image +# import numpy as np +# paddle.set_device('cpu') +# img = paddle.to_tensor(np.asarray(Image.open('./lenna.png'))).astype('float32') +# img = img / 255.0 +# img = paddle.transpose(img, [2, 0, 1]) +# new_img = re(img) +# new_img = new_img * 255.0 +# new_img = paddle.transpose(new_img, [1, 2, 0]) +# new_img = new_img.cpu().numpy() +# 
new_img = Image.fromarray(new_img.astype('uint8')) +# new_img.save('./res.png') +# +# +# +#if __name__ == "__main__": +# main() diff --git a/image_classification/DeiT/regnet.py b/image_classification/DeiT/regnet.py new file mode 100644 index 00000000..8edd95fb --- /dev/null +++ b/image_classification/DeiT/regnet.py @@ -0,0 +1,261 @@ +import numpy as np +import copy +import paddle +import paddle.nn as nn +"""RegNet y-160 +This is a simple version of regnet which only implements RegNetY-160. +This model is used as the teacher model for DeiT. +""" + +class Identity(nn.Layer): + """ Identity Layer """ + def __init__(self): + super().__init__() + + def forward(self, x): + return x + + +class SE(nn.Layer): + """ Squeeze and Excitation module""" + def __init__(self, in_channels, rd_channels, se_ratio=.25): + super().__init__() + if rd_channels is None: + out_channels = int(in_channels * se_ratio) + else: + out_channels = rd_channels + self.avgpool = nn.AdaptiveAvgPool2D(output_size=1) + self.conv1_1x1 = nn.Conv2D(in_channels, out_channels, kernel_size=1) + self.conv2_1x1 = nn.Conv2D(out_channels, in_channels, kernel_size=1) + self.relu = nn.ReLU() + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + out = self.avgpool(x) + out = self.conv1_1x1(out) + out = self.relu(out) + out = self.conv2_1x1(out) + out = self.sigmoid(out) + out = x * out + return out + + +class Downsample(nn.Layer): + """Downsample for 1st bottleneck block in every layer in RegNet""" + def __init__(self, in_channels, out_channels, stride): + super().__init__() + self.conv1x1 = nn.Conv2D(in_channels, + out_channels, + kernel_size=1, + stride=stride, + bias_attr=False) + self.bn = nn.BatchNorm2D(out_channels) + + def forward(self, x): + out = self.conv1x1(x) + out = self.bn(out) + return out + + +class Bottleneck(nn.Layer): + """Bottleneck residual block in Stage""" + def __init__(self, + in_channels, + out_channels, + bottleneck_ratio=1, + group_width=1, + stride=1, + dilation=1, + se_ratio=0.25): + super().__init__() + # 1x1 bottleneck conv block + bottleneck_channels = int(round(out_channels * bottleneck_ratio)) + self.conv1 = nn.Conv2D(in_channels, bottleneck_channels, 1, bias_attr=False) + self.bn1 = nn.BatchNorm2D(bottleneck_channels) + # 3x3 conv block with group conv + groups = bottleneck_channels // group_width + self.conv2 = nn.Conv2D(bottleneck_channels, + bottleneck_channels, + kernel_size=3, + stride=stride, + dilation=dilation, + padding=1, + groups=groups, + bias_attr=False) + self.bn2 = nn.BatchNorm2D(bottleneck_channels) + # SE modual + if se_ratio: + self.se = SE(bottleneck_channels, rd_channels=int(round(in_channels * se_ratio))) + else: + se_ratio = Identity() + # downsample if stride = 2 + if stride != 1 or in_channels != out_channels: + self.downsample = Downsample(in_channels, out_channels, stride) + else: + self.downsample = Identity() + # 1x1 conv block + self.conv3 = nn.Conv2D(bottleneck_channels, + out_channels, + kernel_size=1) + self.bn3 = nn.BatchNorm2D(out_channels) + self.relu = nn.ReLU() + + def forward(self, x): + h = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.se(out) + + out = self.conv3(out) + out = self.bn3(out) + + h = self.downsample(h) + + out = out + h + out = self.relu(out) + return out + + +class RegStage(nn.Layer): + """ Sequence of blocks with the same output shape""" + def __init__(self, + in_channels, + out_channels, + depth, + bottleneck_ratio, + group_width, + 
se_ratio=0.25): + super().__init__() + + self.blocks = nn.LayerList() + for i in range(depth): + block_stride = 2 if i == 0 else 1 + block_in_channels = in_channels if i == 0 else out_channels + self.blocks.append( + copy.deepcopy(Bottleneck(block_in_channels, + out_channels, + bottleneck_ratio, + group_width, + block_stride, + se_ratio=se_ratio))) + + def forward(self, x): + for block in self.blocks: + x = block(x) + return x + + +class RegNet(nn.Layer): + """RegNet Model""" + def __init__(self, cfg): + super().__init__() + num_classes = cfg['num_classes'] + stem_width = cfg['stem_width'] + + # Stem layers + self.stem = nn.Sequential( + nn.Conv2D(in_channels=3, + out_channels=stem_width, + kernel_size=3, + stride=2, + padding=1, + bias_attr=False), + nn.BatchNorm2D(stem_width), + nn.ReLU()) + # RegStages + self.stages = nn.LayerList() + prev_width = stem_width + curr_stride = 2 + stage_params = self._get_stage_params(cfg) + for i, stage_param in enumerate(stage_params): + self.stages.append( + copy.deepcopy(RegStage(in_channels=prev_width, + out_channels=stage_param['out_channels'], + depth=stage_param['depth'], + bottleneck_ratio=stage_param['bottle_ratio'], + group_width=stage_param['group_width'], + se_ratio=stage_param['se_ratio']))) + prev_width = stage_param['out_channels'] + # Head + num_features = prev_width + self.head = nn.Sequential(nn.AdaptiveAvgPool2D(output_size=1), + nn.Flatten(), + nn.Linear(num_features, num_classes)) + + def _get_stage_params(self, cfg): + w_init = cfg['w0'] + w_slope = cfg['wa'] + w_mult = cfg['wm'] + depth = cfg['depth'] + se_ratio = cfg['se_ratio'] + group_w = cfg['group_w'] + bottle_ratio = cfg['bottle_ratio'] + + w, d = self._generate_regnet(w_slope, w_init, w_mult, depth, bottle_ratio, group_w) + + num_stages = len(w) + stage_widths = w + stage_depths = d + stage_bottle_ratios = [bottle_ratio for _ in range(num_stages)] + stage_groups = [group_w for _ in range(num_stages)] + se_ratios = [se_ratio for _ in range(num_stages)] + param_names = ['out_channels', 'depth', 'bottle_ratio', 'group_width','se_ratio'] + stage_params = [ + dict(zip(param_names, params)) for params in zip(stage_widths, + stage_depths, + stage_bottle_ratios, + stage_groups, + se_ratios)] + return stage_params + + def _generate_regnet(self, w_slope, w_init, w_mult, depth, b=1, g=8): + """Generate per block widths from RegNet parameters""" + w_count = w_init + w_slope * np.arange(depth) # Equation 1 + w_exps = np.round(np.log(w_count / w_init) / np.log(w_mult)) # Equation 2 + + w = w_init * np.power(w_mult, w_exps) # Equation 3 + w = np.round(np.divide(w, 8)) * 8 # make all width list divisible by 8 + + w, d = np.unique(w.astype(int), return_counts=True) # find depth and width list + + gtemp = np.minimum(g, w//b) + w = (np.round(w // b / gtemp) * gtemp).astype(int) # width + + return w, d + + def forward_features(self, x): + x = self.stem(x) + for stage in self.stages: + x = stage(x) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + + +def build_regnet(): + """build regnet model using dict as config""" + regnety_160 = { + 'stem_width': 32, + 'bottle_ratio': 1.0, + 'w0': 200, + 'wa': 106.23, + 'wm': 2.48, + 'group_w': 112, + 'depth': 18, + 'se_ratio': 0.25, + 'num_classes': 1000, + 'pool_size': (7, 7), + 'crop_pct': 0.875, + } + model = RegNet(regnety_160) + return model diff --git a/image_classification/DeiT/run_eval.sh b/image_classification/DeiT/run_eval.sh new file mode 100644 index 00000000..7ac58802 --- /dev/null +++ 
b/image_classification/DeiT/run_eval.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ +-cfg='./configs/deit_base_patch16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=16 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./deit_base_distilled_patch16_224' \ diff --git a/image_classification/DeiT/run_eval_multi.sh b/image_classification/DeiT/run_eval_multi.sh new file mode 100644 index 00000000..fc36cb0c --- /dev/null +++ b/image_classification/DeiT/run_eval_multi.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/deit_base_patch16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=16 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./deit_base_distilled_patch16_224' \ +-ngpus=4 diff --git a/image_classification/DeiT/run_eval_multi_224.sh b/image_classification/DeiT/run_eval_multi_224.sh new file mode 100644 index 00000000..0899d98f --- /dev/null +++ b/image_classification/DeiT/run_eval_multi_224.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/deit_base_patch16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=4 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./deit_base_distilled_patch16_224' \ +-ngpus=4 diff --git a/image_classification/DeiT/run_eval_multi_384.sh b/image_classification/DeiT/run_eval_multi_384.sh new file mode 100644 index 00000000..43b05f2d --- /dev/null +++ b/image_classification/DeiT/run_eval_multi_384.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/deit_base_patch16_384.yaml' \ +-dataset='imagenet2012' \ +-batch_size=4 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./deit_base_distilled_patch16_384' \ +-ngpus=4 diff --git a/image_classification/DeiT/run_eval_regnet.sh b/image_classification/DeiT/run_eval_regnet.sh new file mode 100644 index 00000000..5290b3ee --- /dev/null +++ b/image_classification/DeiT/run_eval_regnet.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_eval_regnet_multi_gpu.py \ +-cfg='./configs/regnety_160.yaml' \ +-dataset='imagenet2012' \ +-batch_size=4 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./regnety_160' \ +-ngpus=4 diff --git a/image_classification/DeiT/run_train.sh b/image_classification/DeiT/run_train.sh new file mode 100644 index 00000000..8452dd92 --- /dev/null +++ b/image_classification/DeiT/run_train.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=7 \ +python main_single_gpu.py \ +-cfg='./configs/deit_base_patch16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=4 \ +-data_path='/dataset/imagenet' \ +-teacher_model='./regnety_160' +#-pretrained='./deit_base_distilled_patch16_224' diff --git a/image_classification/DeiT/run_train_multi.sh b/image_classification/DeiT/run_train_multi.sh new file mode 100644 index 00000000..7ce3a4ab --- /dev/null +++ b/image_classification/DeiT/run_train_multi.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/deit_base_patch16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=4 \ +-data_path='/dataset/imagenet' \ +-teacher_model='./regnety_160' +#-pretrained='./deit_base_distilled_patch16_224' diff --git a/image_classification/DeiT/tests/__init__.py b/image_classification/DeiT/tests/__init__.py new file mode 100644 index 00000000..a6131c10 --- /dev/null +++ b/image_classification/DeiT/tests/__init__.py @@ -0,0 +1 @@ +# init diff --git a/image_classification/DeiT/tests/test_auto_augment.py 
b/image_classification/DeiT/tests/test_auto_augment.py new file mode 100644 index 00000000..165d60e0 --- /dev/null +++ b/image_classification/DeiT/tests/test_auto_augment.py @@ -0,0 +1,143 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from PIL import Image +from auto_augment import * + + +class AutoAugmentTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + #cls.img = Image.open('./lena.png') + pass + + @classmethod + def tearDown(cls): + pass + + @unittest.skip('skip for debug') + def test_shear_x(self): + img = AutoAugmentTest.img + img = shear_x(img, 0.3) + img.save('lena_shear_x.png') + + @unittest.skip('skip for debug') + def test_shear_y(self): + img = AutoAugmentTest.img + img = shear_y(img, 0.3) + img.save('lena_shear_y_0.3.png') + + @unittest.skip('skip for debug') + def test_translate_x_relative(self): + img = AutoAugmentTest.img + img = translate_x_relative(img, 0.25) + img.save('lena_translate_x_r_0.25.png') + + @unittest.skip('skip for debug') + def test_translate_y_relative(self): + img = AutoAugmentTest.img + img = translate_y_relative(img, 0.25) + img.save('lena_translate_y_r_0.25.png') + + @unittest.skip('skip for debug') + def test_translate_x_absolute(self): + img = AutoAugmentTest.img + img = translate_x_absolute(img, 150) + img.save('lena_absolute_x_r_150.png') + + @unittest.skip('skip for debug') + def test_translate_y_absolute(self): + img = AutoAugmentTest.img + img = translate_y_absolute(img, 150) + img.save('lena_absolute_y_r_150.png') + + @unittest.skip('skip for debug') + def test_rotate(self): + img = AutoAugmentTest.img + img = rotate(img, 30) + img.save('lena_rotate_30.png') + + @unittest.skip('skip for debug') + def test_auto_contrast(self): + img = AutoAugmentTest.img + img = auto_contrast(img) + img.save('lena_auto_contrast.png') + + @unittest.skip('skip for debug') + def test_invert(self): + img = AutoAugmentTest.img + img = invert(img) + img.save('lena_invert_30.png') + + @unittest.skip('skip for debug') + def test_equalize(self): + img = AutoAugmentTest.img + img = equalize(img) + img.save('lena_equalize.png') + + @unittest.skip('skip for debug') + def test_solarize(self): + img = AutoAugmentTest.img + img = solarize(img, 50) + img.save('lena_solarize_50.png') + + @unittest.skip('skip for debug') + def test_posterize(self): + img = AutoAugmentTest.img + img = posterize(img, 8) + img.save('lena_posterize_8.png') + + @unittest.skip('skip for debug') + def test_contrast(self): + img = AutoAugmentTest.img + img = contrast(img, 1.5) + img.save('lena_contrast_1.5.png') + + @unittest.skip('skip for debug') + def test_color(self): + img = AutoAugmentTest.img + img = color(img, 1.5) + img.save('lena_color_1.5.png') + + @unittest.skip('skip for debug') + def test_brightness(self): + img = AutoAugmentTest.img + img = brightness(img, 1.5) + img.save('lena_brightness_1.5.png') + + @unittest.skip('skip for debug') + def test_sharpness(self): + img = 
AutoAugmentTest.img + img = sharpness(img, 1.5) + img.save('lena_sharpness_1.5.png') + + @unittest.skip('skip for debug') + def test_subpolicy(self): + img = AutoAugmentTest.img + sub = SubPolicy('ShearX', 1.0, 3) + img = sub(img) + img.save('lena_subpolicy.png') + + @unittest.skip('skip for debug') + def test_auto_augment(self): + img = AutoAugmentTest.img + for i in range(10): + policy = auto_augment_policy_original() + aa = AutoAugment(policy) + img = aa(img) + img.save(f'lena_aa_{i}.png') + + diff --git a/image_classification/DeiT/tests/test_dataset.py b/image_classification/DeiT/tests/test_dataset.py new file mode 100644 index 00000000..5202f3c7 --- /dev/null +++ b/image_classification/DeiT/tests/test_dataset.py @@ -0,0 +1,43 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle +import paddle.nn as nn +from config import * +from datasets import * + + +class DatasetTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + paddle.set_device('cpu') + + @classmethod + def tearDown(cls): + pass + + #@unittest.skip('skip for debug') + def test_dataset(self): + config = get_config() + transforms = get_train_transforms(config) + dataset = ImageNet2012Dataset(file_folder='/dataset/imagenet/', mode='train', transform=transforms) + for idx, (data, label) in enumerate(dataset): + self.assertEqual([3, 224, 224], data.shape) + if idx == 10: + return + + + diff --git a/image_classification/DeiT/tests/test_ema.py b/image_classification/DeiT/tests/test_ema.py new file mode 100644 index 00000000..cb6f3257 --- /dev/null +++ b/image_classification/DeiT/tests/test_ema.py @@ -0,0 +1,66 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import copy +import numpy as np +import paddle +import paddle.nn as nn +from model_ema import ModelEma + + +class DummyModel(nn.Layer): + def __init__(self): + super().__init__() + self.layer = nn.Linear(4, 8) + self.head = nn.Linear(8, 5) + + def forward(self, x): + feature = self.layer(x) + out = self.head(feature) + return out + + +class ModelEmaTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + paddle.set_device('cpu') + + @classmethod + def tearDown(cls): + pass + + #@unittest.skip('skip for debug') + def test_model_ema(self): + model = DummyModel() + criterion = nn.CrossEntropyLoss() + optim = paddle.optimizer.SGD(learning_rate=0.1, + parameters=model.parameters()) + model_ema = ModelEma(model, decay=0.5) + prev_weight = copy.deepcopy(model_ema.module.head.weight) + for i in range(5): + x = paddle.rand([4, 4]) + target = paddle.randint(0, 5, [4]) + out = model(x) + loss = criterion(out, target) + loss.backward() + optim.step() + optim.clear_grad() + model_ema.update(model) + # test model ema update + model_ema_weight = model_ema.module.head.weight + self.assertFalse(np.allclose(prev_weight.numpy(), model_ema_weight.numpy(), atol=1e-5)) + prev_weight = copy.deepcopy(model_ema.module.head.weight) + + diff --git a/image_classification/DeiT/tests/test_losses.py b/image_classification/DeiT/tests/test_losses.py new file mode 100644 index 00000000..3d8fab4c --- /dev/null +++ b/image_classification/DeiT/tests/test_losses.py @@ -0,0 +1,124 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +import paddle +import paddle.nn as nn +from losses import SoftTargetCrossEntropyLoss +from losses import LabelSmoothingCrossEntropyLoss +from losses import DistillationLoss + + +class DummyModel(nn.Layer): + def __init__(self, kd=False): + super().__init__() + self.layer = nn.Linear(8, 16) + self.head = nn.Linear(16, 1000) + self.head_kd = nn.Linear(16, 1000) + self.kd = kd + + def forward(self, x): + feature = self.layer(x) + out = self.head(feature) + if self.kd: + out_kd = self.head_kd(feature) + return out, out_kd + else: + return out + + +class LossesTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + paddle.set_device('cpu') + cls.num_classes = 1000 + cls.batch_size = 4 + cls.x = paddle.rand(shape=[cls.batch_size, cls.num_classes]) + cls.target = cls.x.argmax(axis=-1) + + @classmethod + def tearDown(cls): + pass + + @unittest.skip('skip for debug') + def test_soft_target_crossentropy_loss(self): + x = LossesTest.x + target = LossesTest.target + soft_target = paddle.zeros([LossesTest.batch_size, LossesTest.num_classes]) + rand_idx = np.random.randint(0, LossesTest.num_classes - 1, size=LossesTest.batch_size) + for i, idx in enumerate(rand_idx): + soft_target[i, idx] = 0.6 + soft_target[i, idx + 1] = 0.4 + + criterion = SoftTargetCrossEntropyLoss() + loss = criterion(x, soft_target) + + logprob = -nn.functional.log_softmax(x, axis=-1) + true_loss = [] + for i, idx in enumerate(rand_idx): + true_loss.append(logprob[i,int(idx)] * 0.6 + logprob[i, int(idx+1)] * 0.4) + true_loss = np.array(true_loss).mean() + + self.assertAlmostEqual(loss.numpy()[0], true_loss, delta=1e-5) + + + #@unittest.skip('skip for debug') + def test_label_smoothing_crossentropy_loss(self): + x = paddle.to_tensor([[0.2, 0.3, 0.4, 0.1],[0.6, 0.2, 0.1, 0.1]]) + target = paddle.to_tensor([2, 1]) + criterion = LabelSmoothingCrossEntropyLoss(smoothing=0.3) + loss = criterion(x, target) + + val = -paddle.nn.functional.log_softmax(x, axis=-1) + true_loss = val[0][2] * 0.7 + val[1][1] * 0.7 + val[0,:].mean() * 0.3 + val[1,:].mean()* 0.3 + true_loss = true_loss/2.0 + + self.assertAlmostEqual(true_loss.numpy()[0], loss.numpy()[0], delta=1e-5) + + + #@unittest.skip('skip for debug') + def test_distillation_loss(self): + model = DummyModel(kd=True) + teacher_model = DummyModel(kd=False) + x = paddle.randn([4, 8]) + out, out_kd = model(x) + labels = paddle.randint(0, 999, [4]) + + base_criterion = nn.CrossEntropyLoss() + criterion = DistillationLoss(base_criterion, + teacher_model, + 'none', + alpha=0.3, + tau=0.8) + loss = criterion(x, (out, out_kd), labels) + self.assertEqual(loss.shape, [1]) + + criterion = DistillationLoss(base_criterion, + teacher_model, + 'hard', + alpha=0.3, + tau=0.8) + loss = criterion(x, (out, out_kd), labels) + self.assertEqual(loss.shape, [1]) + + criterion = DistillationLoss(base_criterion, + teacher_model, + 'soft', + alpha=0.3, + tau=0.8) + loss = criterion(x, (out, out_kd), labels) + self.assertEqual(loss.shape, [1]) + + diff --git a/image_classification/DeiT/tests/test_mixup.py b/image_classification/DeiT/tests/test_mixup.py new file mode 100644 index 00000000..79e4dba4 --- /dev/null +++ b/image_classification/DeiT/tests/test_mixup.py @@ -0,0 +1,135 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle +import paddle.nn as nn +from mixup import * + + +class MixupTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + paddle.set_device('cpu') + + @classmethod + def tearDown(cls): + pass + + #@unittest.skip('skip for debug') + def test_rand_bbox(self): + image_shape = [4, 3, 224, 224] + lam = 0.2 + cut_rate = np.sqrt(1. - lam) + for i in range(20): + x1, y1, x2, y2 = rand_bbox(image_shape, lam) + #print(x1, y1, x2, y2) + h = x2 - x1 + w = y2 - y1 + self.assertTrue(0 <= x1 <= 224) + self.assertTrue(0 <= y1 <= 224) + self.assertTrue(0 <= x2 <= 224) + self.assertTrue(0 <= y2 <= 224) + self.assertTrue(h <= int(cut_rate * 224)) + self.assertTrue(w <= int(cut_rate * 224)) + + def test_rand_bbox_minmax(self): + image_shape = [4, 3, 224, 224] + minmax = [0.1, 0.3] + for i in range(20): + x1, y1, x2, y2 = rand_bbox_minmax(image_shape, minmax) + h = x2 - x1 + w = y2 - y1 + self.assertTrue(0 <= x1 <= 224) + self.assertTrue(0 <= y1 <= 224) + self.assertTrue(0 <= x2 <= 224) + self.assertTrue(0 <= y2 <= 224) + self.assertTrue(h >= int(minmax[0]* 224)) + self.assertTrue(w >= int(minmax[0]* 224)) + self.assertTrue(h <= int(minmax[1]* 224)) + self.assertTrue(w <= int(minmax[1]* 224)) + + #@unittest.skip('skip for debug') + def test_cutmix_generate_bbox_adjust_lam_lam(self): + image_shape = [4, 3, 224, 224] + orig_lam = 0.2 + cut_rate = np.sqrt(1. - orig_lam) + minmax = None + (x1, y1, x2, y2), lam = cutmix_generate_bbox_adjust_lam(image_shape, orig_lam, minmax) + h = x2 - x1 + w = y2 - y1 + self.assertTrue(0 <= x1 <= 224) + self.assertTrue(0 <= y1 <= 224) + self.assertTrue(0 <= x2 <= 224) + self.assertTrue(0 <= y2 <= 224) + self.assertTrue(h <= cut_rate * 224) + self.assertTrue(w <=cut_rate * 224) + self.assertNotEqual(orig_lam, lam) + + #@unittest.skip('skip for debug') + def test_cutmix_generate_bbox_adjust_lam_minmax(self): + image_shape = [4, 3, 224, 224] + orig_lam = 0.2 + minmax = [0.1, 0.3] + (x1, y1, x2, y2), lam = cutmix_generate_bbox_adjust_lam(image_shape, orig_lam, minmax) + h = x2 - x1 + w = y2 - y1 + self.assertTrue(0 <= x1 <= 224) + self.assertTrue(0 <= y1 <= 224) + self.assertTrue(0 <= x2 <= 224) + self.assertTrue(0 <= y2 <= 224) + self.assertTrue(h >= minmax[0]* 224 - 1) + self.assertTrue(w >= minmax[0]* 224 - 1) + self.assertTrue(h <= minmax[1]* 224 - 1) + self.assertTrue(w <= minmax[1]* 224 - 1) + self.assertNotEqual(orig_lam, lam) + + #@unittest.skip('skip for debug') + def test_one_hot(self): + num_classes = 10 + x = paddle.randint(0, num_classes, [4]) + x_smoothed = one_hot(x, num_classes, on_value=0.8, off_value=0.2) + for i in range(4): + self.assertEqual(x_smoothed[i, x[i]], 0.8) + for j in range(num_classes): + if j != x[i]: + self.assertEqual(x_smoothed[i, j], 0.2) + + #@unittest.skip('skip for debug') + def test_mixup_one_hot(self): + num_classes = 10 + x = paddle.randint(0, num_classes, [4]) + x_mixup = mixup_one_hot(x, num_classes, lam=0.8, smoothing=0.2) + off_value = 0.2 / 10 + on_value = 1. 
- 0.2 + off_value + for i in range(4): + if x[i] != x[-(i+1)]: + self.assertAlmostEqual(x_mixup[i, x[i]].numpy()[0], on_value*0.8 + off_value * 0.2, places=4) + else: + self.assertAlmostEqual(x_mixup[i, x[i]].numpy()[0], on_value*0.8 + on_value * 0.2, places=4) + + #@unittest.skip('skip for debug') + def test_mixup(self): + x = paddle.randn([4, 3, 224, 224]) + label = paddle.randint(0, 10, [4]) + mixup_fn = Mixup(num_classes=10, cutmix_alpha=1.0) + x_new, label_new = mixup_fn(x, label) + self.assertEqual(x_new.shape, x.shape) + self.assertEqual(label_new.shape, [4, 10]) + + mixup_fn = Mixup(num_classes=10, cutmix_alpha=0.2) + x_new, label_new = mixup_fn(x, label) + self.assertEqual(x_new.shape, x.shape) + self.assertEqual(label_new.shape, [4, 10]) diff --git a/image_classification/DeiT/utils.py b/image_classification/DeiT/utils.py new file mode 100644 index 00000000..44800527 --- /dev/null +++ b/image_classification/DeiT/utils.py @@ -0,0 +1,120 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""utils for ViT + +Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training +and WarmupCosineScheduler for training + +""" + +import math +from paddle.optimizer.lr import LRScheduler + + +class AverageMeter(): + """ Meter for monitoring losses""" + def __init__(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.reset() + + def reset(self): + """reset all values to zeros""" + self.avg = 0 + self.sum = 0 + self.cnt = 0 + + def update(self, val, n=1): + """update avg by val and n, where val is the avg of n values""" + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + + +def get_exclude_from_weight_decay_fn(exclude_list=[]): + """ Set params with no weight decay during the training + + For certain params, e.g., positional encoding in ViT, weight decay + may not needed during the learning, this method is used to find + these params. + + Args: + exclude_list: a list of params names which need to exclude + from weight decay. + Returns: + exclude_from_weight_decay_fn: a function returns True if param + will be excluded from weight decay + """ + if len(exclude_list) == 0: + exclude_from_weight_decay_fn = None + else: + def exclude_fn(param): + for name in exclude_list: + if param.endswith(name): + return False + return True + exclude_from_weight_decay_fn = exclude_fn + return exclude_from_weight_decay_fn + + +class WarmupCosineScheduler(LRScheduler): + """Warmup Cosine Scheduler + + First apply linear warmup, then apply cosine decay schedule. + Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" + Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining + "total_epochs - warmup_epochs" + + Attributes: + learning_rate: the starting learning rate (without warmup), not used here! 
+ warmup_start_lr: warmup starting learning rate + start_lr: the starting learning rate (without warmup) + end_lr: the ending learning rate after whole loop + warmup_epochs: # of epochs for warmup + total_epochs: # of total epochs (include warmup) + """ + def __init__(self, + learning_rate, + warmup_start_lr, + start_lr, + end_lr, + warmup_epochs, + total_epochs, + cycles=0.5, + last_epoch=-1, + verbose=False): + """init WarmupCosineScheduler """ + self.warmup_epochs = warmup_epochs + self.total_epochs = total_epochs + self.warmup_start_lr = warmup_start_lr + self.start_lr = start_lr + self.end_lr = end_lr + self.cycles = cycles + super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + """ return lr value """ + if self.last_epoch < self.warmup_epochs: + val = (self.start_lr - self.warmup_start_lr) * float( + self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr + return val + + progress = float(self.last_epoch - self.warmup_epochs) / float( + max(1, self.total_epochs - self.warmup_epochs)) + val = max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) + return val diff --git a/image_classification/MLP-Mixer/README.md b/image_classification/MLP-Mixer/README.md new file mode 100644 index 00000000..d10525cc --- /dev/null +++ b/image_classification/MLP-Mixer/README.md @@ -0,0 +1,166 @@ +# MLP-Mixer: An all-MLP Architecture for Vision, [arxiv](https://arxiv.org/abs/2105.01601) + +PaddlePaddle training/validation code and pretrained models for **MLP-Mixer**. + +The official TF implementation is [here](https://github.com/google-research/vision_transformer). + +This implementation is developed by [PaddleViT](https://github.com/BR-IDL/PaddleViT.git). + + +

+<p align="center">
+<img src="./mlp_mixer.png" alt="drawing"/>
+<h4 align="center">MLP-Mixer Model Overview</h4>
+</p>

+ +### Update +Update (2021-08-11): Code is released and ported weights are uploaded. + +## Models Zoo +| Model | Acc@1 | Acc@5 | Image Size | Crop_pct | Interpolation | Link | +|--------------------------------|-------|-------|------------|----------|---------------|--------------| +| mlp_mixer_b16_224 | 76.60 | 92.23 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1ZcQEH92sEPvYuDc6eYZgssK5UjYomzUD/view?usp=sharing)/[baidu](https://pan.baidu.com/s/12nZaWGMOXwrCMOIBfUuUMA)(xh8x) | +| mlp_mixer_l16_224 | 72.06 | 87.67 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1mkmvqo5K7JuvqGm92a-AdycXIcsv1rdg/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1AmSVpwCaGR9Vjsj_boL7GA)(8q7r) | + +> *The results are evaluated on ImageNet2012 validation set. + +> Note: MLP-Mixer weights are ported from [timm](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/mlp_mixer.py) +) + + +## Notebooks +We provide a few notebooks in aistudio to help you get started: + +**\*(coming soon)\*** + + +## Requirements +- Python>=3.6 +- yaml>=0.2.5 +- [PaddlePaddle](https://www.paddlepaddle.org.cn/documentation/docs/en/install/index_en.html)>=2.1.0 +- [yacs](https://github.com/rbgirshick/yacs)>=0.1.8 + +## Data +ImageNet2012 dataset is used in the following folder structure: +``` +│imagenet/ +├──train/ +│ ├── n01440764 +│ │ ├── n01440764_10026.JPEG +│ │ ├── n01440764_10027.JPEG +│ │ ├── ...... +│ ├── ...... +├──val/ +│ ├── n01440764 +│ │ ├── ILSVRC2012_val_00000293.JPEG +│ │ ├── ILSVRC2012_val_00002138.JPEG +│ │ ├── ...... +│ ├── ...... +``` + +## Usage +To use the model with pretrained weights, download the `.pdparam` weight file and change related file paths in the following python scripts. The model config files are located in `./configs/`.. + +For example, assume the downloaded weight file is stored in `./mixer_b16_224.pdparams`, to use the `mixer_b16_224` model in python: +```python +from config import get_config +from mlp_mixer import build_mlp_mixer as build_model +# config files in ./configs/ +config = get_config('./configs/mixer_b16_224.yaml') +# build model +model = build_model(config) +# load pretrained weights, .pdparams is NOT needed +model_state_dict = paddle.load('./mixer_b16_224') +model.set_dict(model_state_dict) +``` + +## Evaluation +To evaluate MLP-Mixer model performance on ImageNet2012 with a single GPU, run the following script using command line: +```shell +sh run_eval.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/mixer_b16_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./mixer_b16_224' +``` + +
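A note on the Usage section above: the snippet there assumes `paddle` is already imported and passes the weight path without its extension. A self-contained variant, mirroring how the training scripts load checkpoints (they append `.pdparams` before calling `paddle.load`), might look like this sketch; the weight path is a placeholder:

```python
import paddle

from config import get_config
from mlp_mixer import build_mlp_mixer as build_model

# build the model from its yaml config (see ./configs/)
config = get_config('./configs/mixer_b16_224.yaml')
model = build_model(config)

# the training scripts store weights as '<name>.pdparams' and load them
# by appending the extension to the configured prefix
state_dict = paddle.load('./mixer_b16_224.pdparams')  # placeholder path
model.set_dict(state_dict)
model.eval()
```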
+
+
+Run evaluation using multiple GPUs:
+
+```shell
+sh run_eval_multi.sh
+```
+or
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+python main_multi_gpu.py \
+    -cfg='./configs/mixer_b16_224.yaml' \
+    -dataset='imagenet2012' \
+    -batch_size=16 \
+    -data_path='/dataset/imagenet' \
+    -eval \
+    -pretrained='./mixer_b16_224'
+```
+
+
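Both evaluation entry points report top-1 and top-5 accuracy by applying `paddle.metric.accuracy` to the softmax of the model output. A minimal standalone illustration of that metric computation (random logits and labels, CPU only, not tied to any checkpoint) is sketched below:

```python
import paddle
import paddle.nn.functional as F

paddle.set_device('cpu')

# fake batch: 8 samples, 1000 classes, integer labels
logits = paddle.randn([8, 1000])
labels = paddle.randint(0, 1000, [8])

# same pattern as validate() in main_single_gpu.py / main_multi_gpu.py
pred = F.softmax(logits)
acc1 = paddle.metric.accuracy(pred, labels.unsqueeze(1))       # top-1
acc5 = paddle.metric.accuracy(pred, labels.unsqueeze(1), k=5)  # top-5

print(float(acc1), float(acc5))
```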
+
+## Training
+To train the MLP-Mixer model on ImageNet2012 with a single GPU, run the following script using command line:
+```shell
+sh run_train.sh
+```
+or
+```shell
+CUDA_VISIBLE_DEVICES=0 \
+python main_single_gpu.py \
+    -cfg='./configs/mixer_b16_224.yaml' \
+    -dataset='imagenet2012' \
+    -batch_size=32 \
+    -data_path='/dataset/imagenet'
+```
+
+
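The training loops in both scripts use gradient accumulation: `ACCUM_ITER` in config.py (2 by default) makes `optimizer.step()` run only every `accum_iter` batches, so the effective batch size is roughly `batch_size * accum_iter` per GPU. The sketch below reduces that pattern to a toy model and random data; the names here are illustrative, not part of the repo:

```python
import paddle
import paddle.nn as nn

paddle.set_device('cpu')
model = nn.Linear(16, 10)             # toy stand-in for MlpMixer
criterion = nn.CrossEntropyLoss()     # 'mean' reduction, as in the scripts
optimizer = paddle.optimizer.AdamW(parameters=model.parameters(),
                                   learning_rate=1e-3)

accum_iter = 2                        # config.TRAIN.ACCUM_ITER
num_batches = 6

for batch_id in range(num_batches):
    x = paddle.randn([4, 16])
    y = paddle.randint(0, 10, [4])
    loss = criterion(model(x), y)
    loss.backward()                   # gradients accumulate across calls
    if (batch_id + 1) % accum_iter == 0 or (batch_id + 1) == num_batches:
        optimizer.step()
        optimizer.clear_grad()
```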
+
+
+Run training using multiple GPUs:
+
+```shell
+sh run_train_multi.sh
+```
+or
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+python main_multi_gpu.py \
+    -cfg='./configs/mixer_b16_224.yaml' \
+    -dataset='imagenet2012' \
+    -batch_size=16 \
+    -data_path='/dataset/imagenet'
+```
+
+
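Training uses the `warmupcosine` schedule from utils.py: the learning rate rises linearly from `WARMUP_START_LR` to `BASE_LR` over `WARMUP_EPOCHS`, then follows a half-cosine down towards `END_LR`. A small standalone sketch of that curve, re-deriving `get_lr()` with the defaults in config.py, is given here:

```python
import math

# defaults from config.py
base_lr, warmup_start_lr, end_lr = 1e-3, 1e-6, 1e-5
warmup_epochs, total_epochs, cycles = 3, 300, 0.5

def lr_at(epoch):
    """Same piecewise rule as WarmupCosineScheduler.get_lr() in utils.py."""
    if epoch < warmup_epochs:
        return (base_lr - warmup_start_lr) * epoch / warmup_epochs + warmup_start_lr
    progress = (epoch - warmup_epochs) / max(1, total_epochs - warmup_epochs)
    val = max(0.0, 0.5 * (1. + math.cos(math.pi * cycles * 2.0 * progress)))
    return max(0.0, val * (base_lr - end_lr) + end_lr)

for epoch in (0, 1, 3, 150, 300):
    print(epoch, f'{lr_at(epoch):.6f}')
```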
+ + +## Visualization Attention Map +**(coming soon)** + +## Reference +``` +@article{tolstikhin2021mlp, + title={Mlp-mixer: An all-mlp architecture for vision}, + author={Tolstikhin, Ilya and Houlsby, Neil and Kolesnikov, Alexander and Beyer, Lucas and Zhai, Xiaohua and Unterthiner, Thomas and Yung, Jessica and Keysers, Daniel and Uszkoreit, Jakob and Lucic, Mario and others}, + journal={arXiv preprint arXiv:2105.01601}, + year={2021} +} +``` diff --git a/image_classification/MLP-Mixer/config.py b/image_classification/MLP-Mixer/config.py new file mode 100644 index 00000000..3dc24935 --- /dev/null +++ b/image_classification/MLP-Mixer/config.py @@ -0,0 +1,143 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration + +Configuration for data, model archtecture, and training, etc. +Config can be set by .yaml file or by argparser(limited usage) + + +""" +import os +from yacs.config import CfgNode as CN +import yaml + +_C = CN() +_C.BASE = [''] + +# data settings +_C.DATA = CN() +_C.DATA.BATCH_SIZE = 256 #256 # train batch_size for single GPU +_C.DATA.BATCH_SIZE_EVAL = 8 #64 # val batch_size for single GPU +_C.DATA.DATA_PATH = '/dataset/imagenet/' # path to dataset +_C.DATA.DATASET = 'imagenet2012' # dataset name +_C.DATA.IMAGE_SIZE = 224 # input image size: 224 for pretrain, 384 for finetune +_C.DATA.CROP_PCT = 1.0 # input image scale ratio, scale is applied before centercrop in eval mode +_C.DATA.NUM_WORKERS = 2 # number of data loading threads + +# model settings +_C.MODEL = CN() +_C.MODEL.TYPE = 'MLP-Mixer' +_C.MODEL.NAME = 'MLP-Mixer' +_C.MODEL.RESUME = None +_C.MODEL.PRETRAINED = None +_C.MODEL.NUM_CLASSES = 1000 +_C.MODEL.DROPOUT = 0.1 +_C.MODEL.DROPPATH = 0.1 + +# transformer settings +_C.MODEL.MIXER = CN() +_C.MODEL.MIXER.PATCH_SIZE = 16 +_C.MODEL.MIXER.HIDDEN_SIZE = 768 +_C.MODEL.MIXER.NUM_LAYERS = 12 + +# training settings +_C.TRAIN = CN() +_C.TRAIN.LAST_EPOCH = 0 +_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 3 #34 # ~ 10k steps for 4096 batch size +_C.TRAIN.WEIGHT_DECAY = 0.01 #0.3 # 0.0 for finetune +_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune +_C.TRAIN.WARMUP_START_LR = 1e-6 #0.0 +_C.TRAIN.END_LR = 1e-5 +_C.TRAIN.GRAD_CLIP = 1.0 +_C.TRAIN.ACCUM_ITER = 2 #1 + +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' +_C.TRAIN.LR_SCHEDULER.MILESTONES = "30, 60, 90" # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 # only used in StepLRScheduler + +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'AdamW' +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for adamW +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# misc +_C.SAVE = "./output" +_C.TAG = "default" +_C.SAVE_FREQ = 20 # freq to save chpt +_C.REPORT_FREQ = 50 # freq to logging info +_C.VALIDATE_FREQ = 20 # freq to do validation +_C.SEED = 0 +_C.EVAL = False # run evaluation only +_C.LOCAL_RANK = 0 
+_C.NGPUS = 1 + + +def _update_config_from_file(config, cfg_file): + config.defrost() + with open(cfg_file, 'r') as infile: + yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader) + for cfg in yaml_cfg.setdefault('BASE', ['']): + if cfg: + _update_config_from_file( + config, os.path.join(os.path.dirname(cfg_file), cfg) + ) + print('merging config from {}'.format(cfg_file)) + config.merge_from_file(cfg_file) + config.freeze() + +def update_config(config, args): + """Update config by ArgumentParser + Args: + args: ArgumentParser contains options + Return: + config: updated config + """ + if args.cfg: + _update_config_from_file(config, args.cfg) + config.defrost() + if args.dataset: + config.DATA.DATASET = args.dataset + if args.batch_size: + config.DATA.BATCH_SIZE = args.batch_size + if args.image_size: + config.DATA.IMAGE_SIZE = args.image_size + if args.data_path: + config.DATA.DATA_PATH = args.data_path + if args.ngpus: + config.NGPUS = args.ngpus + if args.eval: + config.EVAL = True + config.DATA.BATCH_SIZE_EVAL = args.batch_size + if args.pretrained: + config.MODEL.PRETRAINED = args.pretrained + if args.resume: + config.MODEL.RESUME = args.resume + if args.last_epoch: + config.TRAIN.LAST_EPOCH = args.last_epoch + + #config.freeze() + return config + + +def get_config(cfg_file=None): + """Return a clone of config or load from yaml file""" + config = _C.clone() + if cfg_file: + _update_config_from_file(config, cfg_file) + return config diff --git a/image_classification/MLP-Mixer/configs/mixer_b16_224.yaml b/image_classification/MLP-Mixer/configs/mixer_b16_224.yaml new file mode 100644 index 00000000..ce413827 --- /dev/null +++ b/image_classification/MLP-Mixer/configs/mixer_b16_224.yaml @@ -0,0 +1,11 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: Mixer + NAME: mixer_base_patch16_224 + MIXER: + PATCH_SIZE: 16 + HIDDEN_SIZE: 768 + NUM_LAYERS: 12 + diff --git a/image_classification/MLP-Mixer/configs/mixer_l16_224.yaml b/image_classification/MLP-Mixer/configs/mixer_l16_224.yaml new file mode 100644 index 00000000..0dc9246f --- /dev/null +++ b/image_classification/MLP-Mixer/configs/mixer_l16_224.yaml @@ -0,0 +1,10 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: Mixer + NAME: mixer_large_patch16_224 + MIXER: + PATCH_SIZE: 16 + HIDDEN_SIZE: 1024 + NUM_LAYERS: 24 diff --git a/image_classification/MLP-Mixer/datasets.py b/image_classification/MLP-Mixer/datasets.py new file mode 100644 index 00000000..e207f9ba --- /dev/null +++ b/image_classification/MLP-Mixer/datasets.py @@ -0,0 +1,187 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Dataset related classes and methods for ViT training and validation +Cifar10, Cifar100 and ImageNet2012 are supported +""" + +import os +import math +from paddle.io import Dataset, DataLoader, DistributedBatchSampler +from paddle.vision import transforms, datasets, image_load + +class ImageNet2012Dataset(Dataset): + """Build ImageNet2012 dataset + + This class gets train/val imagenet datasets, which loads transfomed data and labels. + + Attributes: + file_folder: path where imagenet images are stored + transform: preprocessing ops to apply on image + img_path_list: list of full path of images in whole dataset + label_list: list of labels of whole dataset + """ + + def __init__(self, file_folder, mode="train", transform=None): + """Init ImageNet2012 Dataset with dataset file path, mode(train/val), and transform""" + super(ImageNet2012Dataset, self).__init__() + assert mode in ["train", "val"] + self.file_folder = file_folder + self.transform = transform + self.img_path_list = [] + self.label_list = [] + + if mode == "train": + self.list_file = os.path.join(self.file_folder, "train_list.txt") + else: + self.list_file = os.path.join(self.file_folder, "val_list.txt") + + with open(self.list_file, 'r') as infile: + for line in infile: + img_path = line.strip().split()[0] + img_label = int(line.strip().split()[1]) + self.img_path_list.append(os.path.join(self.file_folder, img_path)) + self.label_list.append(img_label) + print(f'----- Imagenet2012 image {mode} list len = {len(self.label_list)}') + + def __len__(self): + return len(self.label_list) + + def __getitem__(self, index): + data = image_load(self.img_path_list[index]).convert('RGB') + data = self.transform(data) + label = self.label_list[index] + + return data, label + + +def get_train_transforms(config): + """ Get training transforms + + For training, a RandomResizedCrop is applied, then normalization is applied with + [0.5, 0.5, 0.5] mean and std. The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + transforms_train = transforms.Compose([ + transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), + scale=(0.05, 1.0)), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_train + + +def get_val_transforms(config): + """ Get training transforms + + For validation, image is first Resize then CenterCrop to image_size. + Then normalization is applied with [0.5, 0.5, 0.5] mean and std. + The input pixel values must be rescaled to [0, 1.] 
+ Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) + transforms_val = transforms.Compose([ + transforms.Resize(scale_size, 'bicubic'), # single int for resize shorter side of image + transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_val + + +def get_dataset(config, mode='train'): + """ Get dataset from config and mode (train/val) + + Returns the related dataset object according to configs and mode(train/val) + + Args: + config: configs contains dataset related settings. see config.py for details + Returns: + dataset: dataset object + """ + assert mode in ['train', 'val'] + if config.DATA.DATASET == "cifar10": + if mode == 'train': + dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "cifar100": + if mode == 'train': + dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "imagenet2012": + if mode == 'train': + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_train_transforms(config)) + else: + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_val_transforms(config)) + else: + raise NotImplementedError( + "[{config.DATA.DATASET}] Only cifar10, cifar100, imagenet2012 are supported now") + return dataset + + +def get_dataloader(config, dataset, mode='train', multi_process=False): + """Get dataloader with config, dataset, mode as input, allows multiGPU settings. + + Multi-GPU loader is implements as distributedBatchSampler. + + Args: + config: see config.py for details + dataset: paddle.io.dataset object + mode: train/val + multi_process: if True, use DistributedBatchSampler to support multi-processing + Returns: + dataloader: paddle.io.DataLoader object. + """ + + if mode == 'train': + batch_size = config.DATA.BATCH_SIZE + else: + batch_size = config.DATA.BATCH_SIZE_EVAL + + if multi_process is True: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=(mode == 'train')) + dataloader = DataLoader(dataset, + batch_sampler=sampler, + num_workers=config.DATA.NUM_WORKERS) + else: + dataloader = DataLoader(dataset, + batch_size=batch_size, + num_workers=config.DATA.NUM_WORKERS, + shuffle=(mode == 'train')) + return dataloader diff --git a/image_classification/MLP-Mixer/droppath.py b/image_classification/MLP-Mixer/droppath.py new file mode 100644 index 00000000..fcff05e9 --- /dev/null +++ b/image_classification/MLP-Mixer/droppath.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth +""" + +import paddle +import paddle.nn as nn + +def drop_path(inputs, drop_prob=0., training=False): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if drop_prob == 0. or not training: + return inputs + keep_prob = 1 - drop_prob + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide(keep_prob) * random_tensor # divide is to keep same output expectation + return output + + +class DropPath(nn.Layer): + """DropPath class""" + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, inputs): + return drop_path(inputs, self.drop_prob, self.training) diff --git a/image_classification/MLP-Mixer/main_multi_gpu.py b/image_classification/MLP-Mixer/main_multi_gpu.py new file mode 100644 index 00000000..b188a70f --- /dev/null +++ b/image_classification/MLP-Mixer/main_multi_gpu.py @@ -0,0 +1,365 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
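The droppath.py module above implements stochastic depth: during training, each sample's residual branch is zeroed with probability `drop_prob`, and the surviving samples are rescaled by `1 / keep_prob` so the expected activation is unchanged. The numeric check below uses plain Paddle ops (not the repo class) to illustrate why the rescaling preserves the expectation:

```python
import paddle

paddle.set_device('cpu')
drop_prob, keep_prob = 0.25, 0.75

x = paddle.ones([10000, 1, 1])                      # many "samples", value 1.0
mask = (keep_prob + paddle.rand(x.shape)).floor()   # 1 with prob keep_prob, else 0
out = x / keep_prob * mask                          # rescale kept samples

print(float(out.mean()))   # ~1.0: expectation is preserved
print(float(mask.mean()))  # ~0.75: fraction of samples kept
```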
+ +"""MLP-Mixer training/validation using multiple GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as dist +from datasets import get_dataloader, get_dataset +from mlp_mixer import build_mlp_mixer as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('MLP-Mixer') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +arguments = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, arguments) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +#config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + # + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: 
{train_acc_meter.avg:.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + dist.all_reduce(loss) + dist.all_reduce(acc1) + dist.all_reduce(acc5) + loss = loss / dist.get_world_size() + acc1 = acc1 / dist.get_world_size() + acc5 = acc5 / dist.get_world_size() + + batch_size = paddle.to_tensor(image.shape[0]) + dist.all_reduce(batch_size) + + val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) + val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) + val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0]) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main_worker(*args): + # 0. Preparation + dist.init_parallel_env() + last_epoch = config.TRAIN.LAST_EPOCH + world_size = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + seed = config.SEED + local_rank + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # 1. Create model + model = build_model(config) + model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train, dataset_val = args[0], args[1] + dataloader_train = get_dataloader(config, dataset_train, 'train', True) + dataloader_val = get_dataloader(config, dataset_val, 'test', True) + total_batch_train = len(dataloader_train) + total_batch_val = len(dataloader_val) + logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. 
Define optimizer and lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip, + #apply_decay_param_fun=get_exclude_from_weight_decay_fn([ + # 'absolute_pos_embed', 'relative_position_bias_table']), + ) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # 5. Load pretrained model / load resumt model and optimizer states + if config.MODEL.PRETRAINED: + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + + # 6. 
Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + + # 6. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER) + scheduler.step() + + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if local_rank == 0: + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +def main(): + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS + dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + + +if __name__ == "__main__": + main() diff --git a/image_classification/MLP-Mixer/main_single_gpu.py b/image_classification/MLP-Mixer/main_single_gpu.py new file mode 100644 index 00000000..77b3c591 --- /dev/null +++ b/image_classification/MLP-Mixer/main_single_gpu.py @@ -0,0 +1,333 @@ + +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""MLP-Mixer training/validation using single GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from datasets import get_dataloader +from datasets import get_dataset +from mlp_mixer import build_mlp_mixer as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('MLP-Mixer') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +args = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, args) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +#config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + 
+ train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + batch_size = image.shape[0] + val_loss_meter.update(loss.numpy()[0], batch_size) + val_acc1_meter.update(acc1.numpy()[0], batch_size) + val_acc5_meter.update(acc5.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main(): + # 0. Preparation + last_epoch = config.TRAIN.LAST_EPOCH + seed = config.SEED + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + #paddle.set_device('gpu:0') + # 1. Create model + model = build_model(config) + #model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + dataloader_train = get_dataloader(config, dataset_train, 'train', False) + dataloader_val = get_dataloader(config, dataset_val, 'val', False) + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. Define lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + # 5. 
Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + # 6. Load pretrained model or load resume model and optimizer states + if config.MODEL.PRETRAINED: + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") + # 7. Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + # 8. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. 
LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=len(dataloader_train), + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + ) + scheduler.step() + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +if __name__ == "__main__": + main() diff --git a/image_classification/MLP-Mixer/mlp_mixer.png b/image_classification/MLP-Mixer/mlp_mixer.png new file mode 100644 index 00000000..f8ec77f2 Binary files /dev/null and b/image_classification/MLP-Mixer/mlp_mixer.png differ diff --git a/image_classification/MLP-Mixer/mlp_mixer.py b/image_classification/MLP-Mixer/mlp_mixer.py new file mode 100644 index 00000000..287ff846 --- /dev/null +++ b/image_classification/MLP-Mixer/mlp_mixer.py @@ -0,0 +1,243 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This module implements MLP-Mixer +MLP-Mixer: An all-MLP Architecture for Vision +https://arxiv.org/abs/2105.01601 +""" + +import paddle +import paddle.nn as nn +from droppath import DropPath + + +class Identity(nn.Layer): + """ Identity layer + + The output of this layer is the input without any change. + Use this layer to avoid if condition in some forward methods + + """ + def __init__(self): + super(Identity, self).__init__() + def forward(self, x): + return x + + +class PatchEmbedding(nn.Layer): + """Patch Embeddings + + Apply patch embeddings on input images. Embeddings is implemented using a Conv2D op. 
+ + Attributes: + image_size: int, input image size, default: 224 + patch_size: int, size of patch, default: 4 + in_channels: int, input image channels, default: 3 + embed_dim: int, embedding dimension, default: 96 + """ + + def __init__(self, image_size=224, patch_size=4, in_channels=3, embed_dim=96, norm_layer=None): + super(PatchEmbedding, self).__init__() + image_size = (image_size, image_size) + patch_size = (patch_size, patch_size) + patches_resolution = [image_size[0]//patch_size[0], image_size[1]//patch_size[1]] + self.image_size = image_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + self.in_channels = in_channels + self.embed_dim = embed_dim + self.patch_embed = nn.Conv2D(in_channels=in_channels, + out_channels=embed_dim, + kernel_size=patch_size, + stride=patch_size) + self.norm = norm_layer if norm_layer is not None else Identity() + + def forward(self, x): + x = self.patch_embed(x) # [batch, embed_dim, h, w] h,w = patch_resolution + x = x.flatten(start_axis=2, stop_axis=-1) # [batch, embed_dim, h*w] h*w = num_patches + x = x.transpose([0, 2, 1]) # [batch, h*w, embed_dim] + x = self.norm(x) # [batch, num_patches, embed_dim] + return x + + +class Mlp(nn.Layer): + """ MLP module + + Impl using nn.Linear and activation is GELU, dropout is applied. + Ops: fc -> act -> dropout -> fc -> dropout + + Attributes: + fc1: nn.Linear + fc2: nn.Linear + act: GELU + dropout1: dropout after fc1 + dropout2: dropout after fc2 + """ + + def __init__(self, in_features, hidden_features, dropout): + super(Mlp, self).__init__() + w_attr_1, b_attr_1 = self._init_weights() + self.fc1 = nn.Linear(in_features, + hidden_features, + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + w_attr_2, b_attr_2 = self._init_weights() + self.fc2 = nn.Linear(hidden_features, + in_features, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + self.act = nn.GELU() + self.dropout = nn.Dropout(dropout) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierUniform()) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Normal(std=1e-6)) + return weight_attr, bias_attr + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.dropout(x) + x = self.fc2(x) + x = self.dropout(x) + return x + + +class MixerBlock(nn.Layer): + """Mixer Block + + This block implements Mixer layer which contains 2 MLP blocks and residuals. + The 1st is token-mixing MLP, the 2nd is channel-mixing MLP. 
+ + Attributes: + mlp_tokens: Mlp layer for token mixing + mlp_channels: Mlp layer for channel mixing + tokens_dim: mlp hidden dim for mlp_tokens + channels_dim: mlp hidden dim for mlp_channels + norm1: nn.LayerNorm, apply before mlp_tokens + norm2: nn.LayerNorm, apply before mlp_channels + """ + + def __init__(self, dim, seq_len, mlp_ratio=(0.5, 4.0), dropout=0., droppath=0.): + super(MixerBlock, self).__init__() + tokens_dim = int(mlp_ratio[0] * dim) + channels_dim = int(mlp_ratio[1] * dim) + self.norm1 = nn.LayerNorm(dim, epsilon=1e-6) + self.mlp_tokens = Mlp(seq_len, tokens_dim, dropout=dropout) + self.drop_path = DropPath(droppath) + self.norm2 = nn.LayerNorm(dim, epsilon=1e-6) + self.mlp_channels = Mlp(dim, channels_dim, dropout=dropout) + + def forward(self, x): + h = x + x = self.norm1(x) + x = x.transpose([0, 2, 1]) + x = self.mlp_tokens(x) + x = x.transpose([0, 2, 1]) + x = self.drop_path(x) + x = x + h + + h = x + x = self.norm2(x) + x = self.mlp_channels(x) + x = self.drop_path(x) + x = x + h + + return x + + +class MlpMixer(nn.Layer): + """MlpMixer model + Args: + num_classes: int, num of image classes, default: 1000 + image_size: int, input image size, default: 224 + in_channels: int, input image channels, default: 3 + patch_size: int, patch size, default: 16 + num_mixer_layers: int, number of mixer blocks, default: 8 + embed_dim: int, output dimension of patch embedding, default: 512 + mlp_ratio: tuple(float, float), mlp scales for mlp token and mlp channels, + mlp_tokens hidden dim = mlp_ratio[0] * embed_dim, + mlp_channels hidden dim = mlp_ratio[1] * embed_dim, + default: (0.5, 4.0) + dropout: float, dropout rate for mlp, default: 0. + droppath: float, droppath rate for mixer block, default: 0. + patch_embed_norm: bool, if True, apply norm in patch embedding, default: False + """ + def __init__(self, + num_classes=1000, + image_size=224, + in_channels=3, + patch_size=16, + num_mixer_layers=8, + embed_dim=512, + mlp_ratio=(0.5, 4.0), + dropout=0., + droppath=0., + patch_embed_norm=False): + super(MlpMixer, self).__init__() + self.num_classes = num_classes + self.num_features = embed_dim + self.embed_dim = embed_dim + + norm_layer = nn.LayerNorm(embed_dim, epsilon=1e-6) + self.patch_embed = PatchEmbedding( + image_size=image_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim, + norm_layer=norm_layer if patch_embed_norm else None) + + self.mixer_layers = nn.Sequential( + *[MixerBlock(embed_dim, + self.patch_embed.num_patches, + mlp_ratio, + dropout, + droppath) for _ in range(num_mixer_layers)]) + + self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6) + self.head = nn.Linear(embed_dim, self.num_classes) + + def forward_features(self, x): + x = self.patch_embed(x) + x = self.mixer_layers(x) + x = self.norm(x) + x = x.mean(axis=1) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +def build_mlp_mixer(config): + """Build mlp mixer by reading options in config object + Args: + config: config instance contains setting options + Returns: + model: MlpMixer model + """ + + model = MlpMixer(num_classes=config.MODEL.NUM_CLASSES, + image_size=config.DATA.IMAGE_SIZE, + in_channels=3, + num_mixer_layers=config.MODEL.MIXER.NUM_LAYERS, + embed_dim=config.MODEL.MIXER.HIDDEN_SIZE, + mlp_ratio=(0.5, 4.0), + dropout=config.MODEL.DROPOUT, + droppath=config.MODEL.DROPPATH) + return model diff --git a/image_classification/MLP-Mixer/port_weights/load_pytorch_weights.py 
b/image_classification/MLP-Mixer/port_weights/load_pytorch_weights.py new file mode 100644 index 00000000..c2613232 --- /dev/null +++ b/image_classification/MLP-Mixer/port_weights/load_pytorch_weights.py @@ -0,0 +1,158 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import argparse +import numpy as np +import paddle +import torch +import timm +from mlp_mixer import build_mlp_mixer +from config import get_config + +config = get_config('./configs/mixer_b16_224.yaml') +print(config) + + +def print_model_named_params(model): + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + print('----------------------------------') + + +def print_model_named_buffers(model): + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + print('----------------------------------') + + +def torch_to_paddle_mapping(): + mapping = [ + ('stem.proj', 'patch_embed.patch_embed'), + ('norm', 'norm'), + ('head', 'head'), + ] + + num_layers = 12 + for idx in range(num_layers): + pp_prefix = f'mixer_layers.{idx}' + th_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp_tokens.fc1', f'{pp_prefix}.mlp_tokens.fc1'), + (f'{th_prefix}.mlp_tokens.fc2', f'{pp_prefix}.mlp_tokens.fc2'), + (f'{th_prefix}.mlp_channels.fc1', f'{pp_prefix}.mlp_channels.fc1'), + (f'{th_prefix}.mlp_channels.fc2', f'{pp_prefix}.mlp_channels.fc2'), + ] + mapping.extend(layer_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'**SET** {th_name} {th_shape} **TO** {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + + if len(value.shape) == 2 and transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_mlp_mixer(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('+++++++++++++++++++++++++++++++++++') + device = torch.device('cpu') + torch_model = timm.create_model('mixer_b16_224', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0:100]) + print(out_paddle[0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./mixer_b16_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + print('all done') + + +if __name__ == "__main__": + main() diff --git a/image_classification/MLP-Mixer/port_weights/load_pytorch_weights_large.py b/image_classification/MLP-Mixer/port_weights/load_pytorch_weights_large.py new file mode 100644 index 00000000..4992473b --- /dev/null +++ b/image_classification/MLP-Mixer/port_weights/load_pytorch_weights_large.py @@ -0,0 +1,158 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
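In `convert()`, 2-D weights are transposed before `set_value` because `torch.nn.Linear` stores its weight as `[out_features, in_features]` while `paddle.nn.Linear` stores `[in_features, out_features]`. The tiny check below (it needs both torch and paddle installed, as the porting scripts already do) illustrates the layout difference behind that transpose:

```python
import paddle
import torch

th_fc = torch.nn.Linear(768, 3072)
pd_fc = paddle.nn.Linear(768, 3072)

print(tuple(th_fc.weight.shape))   # (3072, 768)  -> [out, in]
print(tuple(pd_fc.weight.shape))   # (768, 3072)  -> [in, out]

# hence the transpose applied to 2-D tensors in _set_value()
value = th_fc.weight.data.numpy().transpose((1, 0))
pd_fc.weight.set_value(value)
```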
+ +import os +import argparse +import numpy as np +import paddle +import torch +import timm +from mlp_mixer import build_mlp_mixer +from config import get_config + +config = get_config('./configs/mixer_l16_224.yaml') +print(config) + + +def print_model_named_params(model): + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + print('----------------------------------') + + +def print_model_named_buffers(model): + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + print('----------------------------------') + + +def torch_to_paddle_mapping(): + mapping = [ + ('stem.proj', 'patch_embed.patch_embed'), + ('norm', 'norm'), + ('head', 'head'), + ] + + num_layers = 24 + for idx in range(num_layers): + pp_prefix = f'mixer_layers.{idx}' + th_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp_tokens.fc1', f'{pp_prefix}.mlp_tokens.fc1'), + (f'{th_prefix}.mlp_tokens.fc2', f'{pp_prefix}.mlp_tokens.fc2'), + (f'{th_prefix}.mlp_channels.fc1', f'{pp_prefix}.mlp_channels.fc1'), + (f'{th_prefix}.mlp_channels.fc2', f'{pp_prefix}.mlp_channels.fc2'), + ] + mapping.extend(layer_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'**SET** {th_name} {th_shape} **TO** {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + + if len(value.shape) == 2 and transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_mlp_mixer(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('+++++++++++++++++++++++++++++++++++') + device = torch.device('cpu') + torch_model = timm.create_model('mixer_l16_224', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0:100]) + print(out_paddle[0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./mixer_l16_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + print('all done') + + +if __name__ == "__main__": + main() diff --git a/image_classification/MLP-Mixer/run_eval.sh b/image_classification/MLP-Mixer/run_eval.sh new file mode 100644 index 00000000..e31a49e7 --- /dev/null +++ b/image_classification/MLP-Mixer/run_eval.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=7 \ +python main_single_gpu.py \ +-cfg='./configs/mixer_b16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./mixer_b16_224' diff --git a/image_classification/MLP-Mixer/run_eval_large.sh b/image_classification/MLP-Mixer/run_eval_large.sh new file mode 100644 index 00000000..06006dd2 --- /dev/null +++ b/image_classification/MLP-Mixer/run_eval_large.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=7 \ +python main_single_gpu.py \ +-cfg='./configs/mixer_l16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=16 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./mixer_l16_224' diff --git a/image_classification/MLP-Mixer/run_eval_multi.sh b/image_classification/MLP-Mixer/run_eval_multi.sh new file mode 100644 index 00000000..9b3293e1 --- /dev/null +++ b/image_classification/MLP-Mixer/run_eval_multi.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/mixer_b16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=16 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./mixer_b16_224' \ +-ngpus=4 diff --git a/image_classification/MLP-Mixer/run_eval_multi_large.sh b/image_classification/MLP-Mixer/run_eval_multi_large.sh new file mode 100644 index 00000000..5eec65d7 --- /dev/null +++ b/image_classification/MLP-Mixer/run_eval_multi_large.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/mixer_l16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=8 \ +-data_path='/dataset/imagenet' \ +-eval \ 
+-pretrained='./mixer_l16_224' \ diff --git a/image_classification/MLP-Mixer/run_train.sh b/image_classification/MLP-Mixer/run_train.sh new file mode 100644 index 00000000..725fd11d --- /dev/null +++ b/image_classification/MLP-Mixer/run_train.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=7 \ +python main_single_gpu.py \ +-cfg='./configs/mixer_b16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ diff --git a/image_classification/MLP-Mixer/run_train_multi.sh b/image_classification/MLP-Mixer/run_train_multi.sh new file mode 100644 index 00000000..5537081f --- /dev/null +++ b/image_classification/MLP-Mixer/run_train_multi.sh @@ -0,0 +1,7 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/mixer_b16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-ngpus=4 diff --git a/image_classification/MLP-Mixer/tests/__init__.py b/image_classification/MLP-Mixer/tests/__init__.py new file mode 100644 index 00000000..84952a81 --- /dev/null +++ b/image_classification/MLP-Mixer/tests/__init__.py @@ -0,0 +1 @@ +# init \ No newline at end of file diff --git a/image_classification/MLP-Mixer/tests/test_mlp_mixer.py b/image_classification/MLP-Mixer/tests/test_mlp_mixer.py new file mode 100644 index 00000000..71bd59a5 --- /dev/null +++ b/image_classification/MLP-Mixer/tests/test_mlp_mixer.py @@ -0,0 +1,94 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
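The test module that follows can be driven with Python's standard `unittest` machinery. A minimal sketch of programmatic discovery, assuming the working directory is `image_classification/MLP-Mixer` so that `config.py` and `mlp_mixer.py` are importable:

```python
# Discover and run the MLP-Mixer unit tests; equivalent to running
# `python -m unittest discover -s tests -p "test_*.py" -v` from this folder.
import unittest

suite = unittest.defaultTestLoader.discover(start_dir='tests', pattern='test_*.py')
unittest.TextTestRunner(verbosity=2).run(suite)
```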
+ +import unittest +import numpy as np +import paddle +import paddle.nn as nn +from config import * +from mlp_mixer import Identity +from mlp_mixer import PatchEmbedding +from mlp_mixer import Mlp +from mlp_mixer import MixerBlock +from mlp_mixer import MlpMixer +from mlp_mixer import build_mlp_mixer + + +class MlpTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + paddle.set_device('cpu') + cls.config = get_config() + cls.dummy_img = np.random.randn(4, 3, 224, 224).astype('float32') + cls.dummy_tensor = paddle.to_tensor(cls.dummy_img) + cls.model = build_mlp_mixer(cls.config) + + @classmethod + def tearDown(cls): + pass + + #@unittest.skip('skip for debug') + def test_out_shape(self): + out = MlpTest.model(MlpTest.dummy_tensor) + self.assertEqual(out.shape, [4, 1000]) + + #@unittest.skip('skip for debug') + def test_all_parameters_updated(self): + optim = paddle.optimizer.SGD( + parameters=MlpTest.model.parameters(), learning_rate=0.1) + out = MlpTest.model(MlpTest.dummy_tensor) + loss = out.mean() + loss.backward() + optim.step() + + for name, param in MlpTest.model.named_parameters(): + if not param.stop_gradient: + self.assertIsNotNone(param.gradient()) + self.assertNotEqual(0, np.sum(param.gradient()**2)) + + #@unittest.skip('skip for debug') + def test_embeddings(self): + embed = PatchEmbedding(embed_dim=768) + dummy_img = np.random.randn(4, 3, 224, 224).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + + embed_out = embed(dummy_tensor) + self.assertEqual(embed_out.shape, [4, 3136, 768]) + + #@unittest.skip('skip for debug') + def test_mlp(self): + mlp_op = Mlp(768, 256, 0.0) + dummy_img = np.random.randn(4, 50, 768).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + + out = mlp_op(dummy_tensor) + self.assertEqual(out.shape, [4, 50, 768]) + + #@unittest.skip('skip for debug') + def test_identity(self): + op = Identity() + dummy_img = np.random.randn(4, 50, 768).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + + out = op(dummy_tensor) + self.assertEqual(out.shape, [4, 50, 768]) + + #@unittest.skip('skip for debug') + def test_mixer_block(self): + op = MixerBlock(dim=768, seq_len=50) + dummy_img = np.random.randn(4, 50, 768).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + + out = op(dummy_tensor) + self.assertEqual(out.shape, [4, 50, 768]) diff --git a/image_classification/MLP-Mixer/utils.py b/image_classification/MLP-Mixer/utils.py new file mode 100644 index 00000000..44800527 --- /dev/null +++ b/image_classification/MLP-Mixer/utils.py @@ -0,0 +1,120 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
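As a side note on the run scripts above: `-pretrained` is passed without the `.pdparams` suffix, while the porting scripts write the full file (e.g. `./mixer_b16_224.pdparams`). A hedged sketch of loading that exported file back into the Paddle model directly, run from the MLP-Mixer folder:

```python
import paddle
from config import get_config
from mlp_mixer import build_mlp_mixer

config = get_config('./configs/mixer_b16_224.yaml')
model = build_mlp_mixer(config)

# The porting script saves './mixer_b16_224.pdparams'; the suffix is spelled
# out here, but omitted in the -pretrained argument of the run scripts.
state_dict = paddle.load('./mixer_b16_224.pdparams')
model.set_dict(state_dict)
model.eval()
print(f'loaded {len(state_dict)} tensors')
```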
+ +"""utils for ViT + +Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training +and WarmupCosineScheduler for training + +""" + +import math +from paddle.optimizer.lr import LRScheduler + + +class AverageMeter(): + """ Meter for monitoring losses""" + def __init__(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.reset() + + def reset(self): + """reset all values to zeros""" + self.avg = 0 + self.sum = 0 + self.cnt = 0 + + def update(self, val, n=1): + """update avg by val and n, where val is the avg of n values""" + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + + +def get_exclude_from_weight_decay_fn(exclude_list=[]): + """ Set params with no weight decay during the training + + For certain params, e.g., positional encoding in ViT, weight decay + may not needed during the learning, this method is used to find + these params. + + Args: + exclude_list: a list of params names which need to exclude + from weight decay. + Returns: + exclude_from_weight_decay_fn: a function returns True if param + will be excluded from weight decay + """ + if len(exclude_list) == 0: + exclude_from_weight_decay_fn = None + else: + def exclude_fn(param): + for name in exclude_list: + if param.endswith(name): + return False + return True + exclude_from_weight_decay_fn = exclude_fn + return exclude_from_weight_decay_fn + + +class WarmupCosineScheduler(LRScheduler): + """Warmup Cosine Scheduler + + First apply linear warmup, then apply cosine decay schedule. + Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" + Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining + "total_epochs - warmup_epochs" + + Attributes: + learning_rate: the starting learning rate (without warmup), not used here! + warmup_start_lr: warmup starting learning rate + start_lr: the starting learning rate (without warmup) + end_lr: the ending learning rate after whole loop + warmup_epochs: # of epochs for warmup + total_epochs: # of total epochs (include warmup) + """ + def __init__(self, + learning_rate, + warmup_start_lr, + start_lr, + end_lr, + warmup_epochs, + total_epochs, + cycles=0.5, + last_epoch=-1, + verbose=False): + """init WarmupCosineScheduler """ + self.warmup_epochs = warmup_epochs + self.total_epochs = total_epochs + self.warmup_start_lr = warmup_start_lr + self.start_lr = start_lr + self.end_lr = end_lr + self.cycles = cycles + super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + """ return lr value """ + if self.last_epoch < self.warmup_epochs: + val = (self.start_lr - self.warmup_start_lr) * float( + self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr + return val + + progress = float(self.last_epoch - self.warmup_epochs) / float( + max(1, self.total_epochs - self.warmup_epochs)) + val = max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) + return val diff --git a/image_classification/PVTv2/README.md b/image_classification/PVTv2/README.md new file mode 100644 index 00000000..925af848 --- /dev/null +++ b/image_classification/PVTv2/README.md @@ -0,0 +1,168 @@ +# PVTv2: Improved Baselines with Pyramid Vision Transformer, [arxiv](https://arxiv.org/abs/2106.13797) + +PaddlePaddle training/validation code and pretrained models for **PVTv2**. + +The official pytorch implementation is [here](https://github.com/whai362/PVT). 
+ +This implementation is developed by [PaddleViT](https://github.com/BR-IDL/PaddleViT.git). + + +

+drawing +

PVTv2 Model Overview

+

+ + +### Update +Update (2021-08-11): Code is released and ported weights are uploaded. + +## Models Zoo +| Model | Acc@1 | Acc@5 | Image Size | Crop_pct | Interpolation | Link | +|--------------------------------|-------|-------|------------|----------|---------------|--------------| +| pvtv2_b0 | 70.47 | 90.16 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1wkx4un6y7V87Rp_ZlD4_pV63QRst-1AE/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1mab4dOtBB-HsdzFJYrvgjA)(dxgb) | +| pvtv2_b1 | 78.70 | 94.49 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/11hqLxL2MTSnKPb-gp2eMZLAzT6q2UsmG/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1Ur0s4SEOxVqggmgq6AM-sQ)(2e5m) | +| pvtv2_b2 | 82.02 | 95.99 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1-KY6NbS3Y3gCaPaUam0v_Xlk1fT-N1Mz/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1FWx0QB7_8_ikrPIOlL7ung)(are2) | +| pvtv2_b3 | 83.14 | 96.47 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/16yYV8x7aKssGYmdE-YP99GMg4NKGR5j1/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1ge0rBsCqIcpIjrVxsrFhnw)(nc21) | +| pvtv2_b4 | 83.61 | 96.69 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1gvPdvDeq0VchOUuriTnnGUKh0N2lj-fA/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1VMSD_Kr_hduCZ5dxmDbLoA)(tthf) | +| pvtv2_b5 | 83.77 | 96.61 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1OHaHiHN_AjsGYBN2gxFcQCDhBbTvZ02g/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1ey4agxI2Nb0F6iaaX3zAbA)(9v6n) | +| pvtv2_b2_linear | 82.06 | 96.04 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1hC8wE_XanMPi0_y9apEBKzNc4acZW5Uy/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1IAhiiaJPe-Lg1Qjxp2p30w)(a4c8) | + +> *The results are evaluated on ImageNet2012 validation set. +## Notebooks +We provide a few notebooks in aistudio to help you get started: + +**\*(coming soon)\*** + + +## Requirements +- Python>=3.6 +- yaml>=0.2.5 +- [PaddlePaddle](https://www.paddlepaddle.org.cn/documentation/docs/en/install/index_en.html)>=2.1.0 +- [yacs](https://github.com/rbgirshick/yacs)>=0.1.8 + +## Data +ImageNet2012 dataset is used in the following folder structure: +``` +│imagenet/ +├──train/ +│ ├── n01440764 +│ │ ├── n01440764_10026.JPEG +│ │ ├── n01440764_10027.JPEG +│ │ ├── ...... +│ ├── ...... +├──val/ +│ ├── n01440764 +│ │ ├── ILSVRC2012_val_00000293.JPEG +│ │ ├── ILSVRC2012_val_00002138.JPEG +│ │ ├── ...... +│ ├── ...... +``` + +## Usage +To use the model with pretrained weights, download the `.pdparam` weight file and change related file paths in the following python scripts. The model config files are located in `./configs/`. 
+ +For example, assume the downloaded weight file is stored in `./pvtv2_b0.pdparams`, to use the `pvtv2_b0` model in python: +```python +from config import get_config +from pvtv2 import build_pvtv2 as build_model +# config files in ./configs/ +config = get_config('./configs/pvtv2_b0.yaml') +# build model +model = build_model(config) +# load pretrained weights, .pdparams is NOT needed +model_state_dict = paddle.load('./pvtv2_b0') +model.set_dict(model_state_dict) +``` + +## Evaluation +To evaluate PVTv2 model performance on ImageNet2012 with a single GPU, run the following script using command line: +```shell +sh run_eval.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/pvtv2_b0.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./pvtv2_b0' +``` + +
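The `Crop_pct` column in the model zoo table above maps directly onto the eval-time preprocessing in `datasets.py`: images are first resized to `image_size / crop_pct` and then center-cropped back to `image_size`. A quick check of the numbers:

```python
import math

# Eval-time resize used by get_val_transforms in datasets.py.
image_size, crop_pct = 224, 0.875
scale_size = int(math.floor(image_size / crop_pct))
print(scale_size)  # 256: resize to 256, then center-crop to 224
```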
+ + +Run evaluation using multi-GPUs: + + + +```shell +sh run_eval_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/pvtv2_b0.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./pvtv2_b0' +``` + +
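Beyond the full-dataset evaluation above, a quick single-image forward pass is a handy sanity check that the weights loaded correctly. A minimal sketch (the image path is a placeholder; preprocessing mirrors `get_val_transforms` in `datasets.py`, and the weight file is assumed to be at `./pvtv2_b0.pdparams`):

```python
import math
import paddle
from paddle.vision import transforms, image_load
from config import get_config
from pvtv2 import build_pvtv2 as build_model

config = get_config('./configs/pvtv2_b0.yaml')
model = build_model(config)
model.set_dict(paddle.load('./pvtv2_b0.pdparams'))
model.eval()

# Same eval-time preprocessing as get_val_transforms in datasets.py.
scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT))
val_transforms = transforms.Compose([
    transforms.Resize(scale_size, interpolation='bicubic'),
    transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

img = image_load('path/to/some_image.jpg').convert('RGB')  # placeholder path
x = paddle.unsqueeze(val_transforms(img), axis=0)           # [1, 3, 224, 224]

with paddle.no_grad():
    logits = model(x)
print(paddle.argmax(logits, axis=-1).numpy())  # predicted ImageNet class id
```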
+ + +## Training +To train the PVTv2 Transformer model on ImageNet2012 with single GPU, run the following script using command line: +```shell +sh run_train.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/pvtv2_b0.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ +``` + +
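Note that the training loop accumulates gradients over `TRAIN.ACCUM_ITER` batches before each optimizer step (default 2 in `config.py`), so the effective batch size is larger than the `-batch_size` passed per process. A quick back-of-the-envelope check with illustrative numbers:

```python
# Effective global batch size under gradient accumulation.
# Illustrative values: -batch_size=32 per GPU, ACCUM_ITER=2, single GPU.
per_gpu_batch_size = 32
accum_iter = 2
num_gpus = 1
print(per_gpu_batch_size * accum_iter * num_gpus)  # 64
```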
+ + +Run training using multi-GPUs: + + + +```shell +sh run_train_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +python main_multi_gpu.py \ + -cfg='./configs/pvtv2_b0.yaml' \ + -dataset='imagenet2012' \ + -batch_size=32 \ + -data_path='/dataset/imagenet' \ +``` + +
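Both training entry points build a warmup-plus-cosine learning-rate schedule (`WarmupCosineScheduler` from this folder's `utils.py`, fed by the `TRAIN.*` values in `config.py`). A small sketch to print the schedule with the default settings; it assumes it is run from the PVTv2 folder and that the scheduler signature matches the call in `main_single_gpu.py`:

```python
# Inspect the LR schedule used by main_single_gpu.py / main_multi_gpu.py:
# linear warmup for WARMUP_EPOCHS epochs, then cosine decay down to END_LR.
from utils import WarmupCosineScheduler

scheduler = WarmupCosineScheduler(learning_rate=0.001,  # TRAIN.BASE_LR
                                  warmup_start_lr=0.0,  # TRAIN.WARMUP_START_LR
                                  start_lr=0.001,       # TRAIN.BASE_LR
                                  end_lr=0.0,           # TRAIN.END_LR
                                  warmup_epochs=20,     # TRAIN.WARMUP_EPOCHS
                                  total_epochs=300)     # TRAIN.NUM_EPOCHS

for epoch in range(300):
    if epoch % 50 == 0:
        print(f'epoch {epoch:3d}: lr = {scheduler.get_lr():.6f}')
    scheduler.step()  # one scheduler step per epoch, as in the training loops
```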
+ + +## Visualization Attention Map +**(coming soon)** + +## Reference +``` +@article{wang2021pvtv2, + title={Pvtv2: Improved baselines with pyramid vision transformer}, + author={Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling}, + journal={arXiv preprint arXiv:2106.13797}, + year={2021} +} +``` diff --git a/image_classification/PVTv2/config.py b/image_classification/PVTv2/config.py new file mode 100644 index 00000000..d63aea34 --- /dev/null +++ b/image_classification/PVTv2/config.py @@ -0,0 +1,154 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration + +Configuration for data, model archtecture, and training, etc. +Config can be set by .yaml file or by argparser(limited usage) + + +""" + +import os +from yacs.config import CfgNode as CN +import yaml + +_C = CN() +_C.BASE = [''] + +# data settings +_C.DATA = CN() +_C.DATA.BATCH_SIZE = 4 #1024 batch_size for single GPU +_C.DATA.BATCH_SIZE_EVAL = 4 #1024 batch_size for single GPU +_C.DATA.DATA_PATH = '/dataset/imagenet/' # path to dataset +_C.DATA.DATASET = 'imagenet2012' # dataset name +_C.DATA.IMAGE_SIZE = 224 # input image size +_C.DATA.CROP_PCT = 0.875 # input image scale ratio, scale is applied before centercrop in eval mode +_C.DATA.NUM_WORKERS = 2 # number of data loading threads + +# model settings +_C.MODEL = CN() +_C.MODEL.TYPE = 'PVTv2' +_C.MODEL.NAME = 'pvtv2_tiny_224' +_C.MODEL.RESUME = None +_C.MODEL.PRETRAINED = None +_C.MODEL.NUM_CLASSES = 1000 +_C.MODEL.DROPOUT = 0.0 +_C.MODEL.ATTENTION_DROPOUT = 0.0 +_C.MODEL.DROP_PATH = 0.1 + +# transformer settings +_C.MODEL.TRANS = CN() +_C.MODEL.TRANS.PATCH_SIZE = 4 +_C.MODEL.TRANS.IN_CHANNELS = 3 +_C.MODEL.TRANS.EMBED_DIMS = [32, 64, 160, 256] # same as HIDDEN_SIZE in ViT +_C.MODEL.TRANS.STAGE_DEPTHS = [2, 2, 2, 2] +_C.MODEL.TRANS.NUM_HEADS = [1, 2, 5, 8] +_C.MODEL.TRANS.MLP_RATIO = [8, 8, 4, 4] +_C.MODEL.TRANS.SR_RATIO = [8, 4, 2, 1] +_C.MODEL.TRANS.QKV_BIAS = True +_C.MODEL.TRANS.QK_SCALE = None +_C.MODEL.TRANS.LINEAR = False + +# training settings +_C.TRAIN = CN() +_C.TRAIN.LAST_EPOCH = 0 +_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 20 +_C.TRAIN.WEIGHT_DECAY = 0.05 +_C.TRAIN.BASE_LR = 0.001 +_C.TRAIN.WARMUP_START_LR = 0.0 +_C.TRAIN.END_LR = 0.0 +_C.TRAIN.GRAD_CLIP = 1.0 +_C.TRAIN.ACCUM_ITER = 2 + +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' +_C.TRAIN.LR_SCHEDULER.MILESTONES = "30, 60, 90" # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 # only used in StepLRScheduler + +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'SGD' +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + + +# misc +_C.SAVE = "./output" +_C.TAG = "default" +_C.SAVE_FREQ = 20 # freq to save chpt +_C.REPORT_FREQ = 50 # freq to logging info +_C.VALIDATE_FREQ = 20 # freq 
to do validation +_C.SEED = 0 +_C.EVAL = False # run evaluation only +_C.LOCAL_RANK = 0 +_C.NGPUS = -1 + + +def _update_config_from_file(config, cfg_file): + config.defrost() + with open(cfg_file, 'r') as infile: + yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader) + for cfg in yaml_cfg.setdefault('BASE', ['']): + if cfg: + _update_config_from_file( + config, os.path.join(os.path.dirname(cfg_file), cfg) + ) + print('merging config from {}'.format(cfg_file)) + config.merge_from_file(cfg_file) + config.freeze() + + +def update_config(config, args): + """Update config by ArgumentParser + Args: + args: ArgumentParser contains options + Return: + config: updated config + """ + if args.cfg: + _update_config_from_file(config, args.cfg) + config.defrost() + if args.dataset: + config.DATA.DATASET = args.dataset + if args.batch_size: + config.DATA.BATCH_SIZE = args.batch_size + if args.image_size: + config.DATA.IMAGE_SIZE = args.image_size + if args.data_path: + config.DATA.DATA_PATH = args.data_path + if args.ngpus: + config.NGPUS = args.ngpus + if args.eval: + config.EVAL = True + config.DATA.BATCH_SIZE_EVAL = args.batch_size + if args.pretrained: + config.MODEL.PRETRAINED = args.pretrained + if args.resume: + config.MODEL.RESUME = args.resume + if args.last_epoch: + config.TRAIN.LAST_EPOCH = args.last_epoch + + #config.freeze() + return config + + +def get_config(cfg_file=None): + """Return a clone of config or load from yaml file""" + config = _C.clone() + if cfg_file: + _update_config_from_file(config, cfg_file) + return config diff --git a/image_classification/PVTv2/configs/pvtv2_b0.yaml b/image_classification/PVTv2/configs/pvtv2_b0.yaml new file mode 100644 index 00000000..c8854b95 --- /dev/null +++ b/image_classification/PVTv2/configs/pvtv2_b0.yaml @@ -0,0 +1,18 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: PVTv2 + NAME: pvtv2_b0 + TRANS: + PATCH_SIZE: 4 + EMBED_DIMS: [32, 64, 160, 256] + STAGE_DEPTHS: [2, 2, 2, 2] + NUM_HEADS: [1, 2, 5, 8] + MLP_RATIO: [8, 8, 4, 4] + SR_RATIO: [8, 4, 2, 1] + QKV_BIAS: True + DROP_PATH: 0.1 +TRAIN: + GRAD_CLIP: None + diff --git a/image_classification/PVTv2/configs/pvtv2_b1.yaml b/image_classification/PVTv2/configs/pvtv2_b1.yaml new file mode 100644 index 00000000..95135935 --- /dev/null +++ b/image_classification/PVTv2/configs/pvtv2_b1.yaml @@ -0,0 +1,18 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: PVTv2 + NAME: pvtv2_b1 + TRANS: + PATCH_SIZE: 4 + EMBED_DIMS: [64, 128, 320, 512] + STAGE_DEPTHS: [2, 2, 2, 2] + NUM_HEADS: [1, 2, 5, 8] + MLP_RATIO: [8, 8, 4, 4] + SR_RATIO: [8, 4, 2, 1] + QKV_BIAS: True + DROP_PATH: 0.1 +TRAIN: + GRAD_CLIP: None + diff --git a/image_classification/PVTv2/configs/pvtv2_b2.yaml b/image_classification/PVTv2/configs/pvtv2_b2.yaml new file mode 100644 index 00000000..5102f3d3 --- /dev/null +++ b/image_classification/PVTv2/configs/pvtv2_b2.yaml @@ -0,0 +1,18 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: PVTv2 + NAME: pvtv2_b2 + TRANS: + PATCH_SIZE: 4 + EMBED_DIMS: [64, 128, 320, 512] + STAGE_DEPTHS: [3, 4, 6, 3] + NUM_HEADS: [1, 2, 5, 8] + MLP_RATIO: [8, 8, 4, 4] + SR_RATIO: [8, 4, 2, 1] + QKV_BIAS: True + DROP_PATH: 0.1 +TRAIN: + GRAD_CLIP: None + diff --git a/image_classification/PVTv2/configs/pvtv2_b2_linear.yaml b/image_classification/PVTv2/configs/pvtv2_b2_linear.yaml new file mode 100644 index 00000000..10e8384c --- /dev/null +++ b/image_classification/PVTv2/configs/pvtv2_b2_linear.yaml @@ -0,0 +1,19 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: PVTv2 + NAME: 
pvtv2_b2_linear + TRANS: + PATCH_SIZE: 4 + EMBED_DIMS: [64, 128, 320, 512] + STAGE_DEPTHS: [3, 4, 6, 3] + NUM_HEADS: [1, 2, 5, 8] + MLP_RATIO: [8, 8, 4, 4] + SR_RATIO: [8, 4, 2, 1] + LINEAR: True + QKV_BIAS: True + DROP_PATH: 0.1 +TRAIN: + GRAD_CLIP: None + diff --git a/image_classification/PVTv2/configs/pvtv2_b3.yaml b/image_classification/PVTv2/configs/pvtv2_b3.yaml new file mode 100644 index 00000000..823a1889 --- /dev/null +++ b/image_classification/PVTv2/configs/pvtv2_b3.yaml @@ -0,0 +1,18 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: PVTv2 + NAME: pvtv2_b3 + TRANS: + PATCH_SIZE: 4 + EMBED_DIMS: [64, 128, 320, 512] + STAGE_DEPTHS: [3, 4, 18, 3] + NUM_HEADS: [1, 2, 5, 8] + MLP_RATIO: [8, 8, 4, 4] + SR_RATIO: [8, 4, 2, 1] + QKV_BIAS: True + DROP_PATH: 0.3 +TRAIN: + GRAD_CLIP: 1.0 + diff --git a/image_classification/PVTv2/configs/pvtv2_b4.yaml b/image_classification/PVTv2/configs/pvtv2_b4.yaml new file mode 100644 index 00000000..f8f3472e --- /dev/null +++ b/image_classification/PVTv2/configs/pvtv2_b4.yaml @@ -0,0 +1,18 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: PVTv2 + NAME: pvtv2_b4 + TRANS: + PATCH_SIZE: 4 + EMBED_DIMS: [64, 128, 320, 512] + STAGE_DEPTHS: [3, 8, 27, 3] + NUM_HEADS: [1, 2, 5, 8] + MLP_RATIO: [8, 8, 4, 4] + SR_RATIO: [8, 4, 2, 1] + QKV_BIAS: True + DROP_PATH: 0.3 +TRAIN: + GRAD_CLIP: 1.0 + diff --git a/image_classification/PVTv2/configs/pvtv2_b5.yaml b/image_classification/PVTv2/configs/pvtv2_b5.yaml new file mode 100644 index 00000000..fea21eb1 --- /dev/null +++ b/image_classification/PVTv2/configs/pvtv2_b5.yaml @@ -0,0 +1,18 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: PVTv2 + NAME: pvtv2_b5 + TRANS: + PATCH_SIZE: 4 + EMBED_DIMS: [64, 128, 320, 512] + STAGE_DEPTHS: [3, 6, 40, 3] + NUM_HEADS: [1, 2, 5, 8] + MLP_RATIO: [4, 4, 4, 4] + SR_RATIO: [8, 4, 2, 1] + QKV_BIAS: True + DROP_PATH: 0.3 +TRAIN: + GRAD_CLIP: 1.0 + diff --git a/image_classification/PVTv2/datasets.py b/image_classification/PVTv2/datasets.py new file mode 100644 index 00000000..10ba78fe --- /dev/null +++ b/image_classification/PVTv2/datasets.py @@ -0,0 +1,187 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Dataset related classes and methods for PvTv2 training and validation +Cifar10, Cifar100 and ImageNet2012 are supported +""" + +import os +import math +from paddle.io import Dataset, DataLoader, DistributedBatchSampler +from paddle.vision import transforms, datasets, image_load + +class ImageNet2012Dataset(Dataset): + """Build ImageNet2012 dataset + + This class gets train/val imagenet datasets, which loads transfomed data and labels. 
+ + Attributes: + file_folder: path where imagenet images are stored + transform: preprocessing ops to apply on image + img_path_list: list of full path of images in whole dataset + label_list: list of labels of whole dataset + """ + + def __init__(self, file_folder, mode="train", transform=None): + """Init ImageNet2012 Dataset with dataset file path, mode(train/val), and transform""" + super(ImageNet2012Dataset, self).__init__() + assert mode in ["train", "val"] + self.file_folder = file_folder + self.transform = transform + self.img_path_list = [] + self.label_list = [] + + if mode == "train": + self.list_file = os.path.join(self.file_folder, "train_list.txt") + else: + self.list_file = os.path.join(self.file_folder, "val_list.txt") + + with open(self.list_file, 'r') as infile: + for line in infile: + img_path = line.strip().split()[0] + img_label = int(line.strip().split()[1]) + self.img_path_list.append(os.path.join(self.file_folder, img_path)) + self.label_list.append(img_label) + print(f'----- Imagenet2012 image {mode} list len = {len(self.label_list)}') + + def __len__(self): + return len(self.label_list) + + def __getitem__(self, index): + data = image_load(self.img_path_list[index]).convert('RGB') + data = self.transform(data) + label = self.label_list[index] + + return data, label + + +def get_train_transforms(config): + """ Get training transforms + + For training, a RandomResizedCrop is applied, then normalization is applied with + [0.485, 0.456, 0.406], mean and [0.229, 0.224, 0.225] std. + The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + transforms_train = transforms.Compose([ + transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), + scale=(0.05, 1.0)), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_train + + +def get_val_transforms(config): + """ Get training transforms + + For validation, image is first Resize then CenterCrop to image_size. + Then normalization is applied with [0.485, 0.456, 0.406] mean and [0.229, 0.224, 0.225] std. + The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) + transforms_val = transforms.Compose([ + transforms.Resize(scale_size, interpolation='bicubic'), + transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_val + + +def get_dataset(config, mode='train'): + """ Get dataset from config and mode (train/val) + + Returns the related dataset object according to configs and mode(train/val) + + Args: + config: configs contains dataset related settings. 
see config.py for details + Returns: + dataset: dataset object + """ + + assert mode in ['train', 'val'] + if config.DATA.DATASET == "cifar10": + if mode == 'train': + dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "cifar100": + if mode == 'train': + dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "imagenet2012": + if mode == 'train': + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_train_transforms(config)) + else: + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_val_transforms(config)) + else: + raise NotImplementedError( + "[{config.DATA.DATASET}] Only cifar10, cifar100, imagenet2012 are supported now") + return dataset + + +def get_dataloader(config, dataset, mode='train', multi_process=False): + """Get dataloader with config, dataset, mode as input, allows multiGPU settings. + + Multi-GPU loader is implements as distributedBatchSampler. + + Args: + config: see config.py for details + dataset: paddle.io.dataset object + mode: train/val + multi_process: if True, use DistributedBatchSampler to support multi-processing + Returns: + dataloader: paddle.io.DataLoader object. + """ + + if mode == 'train': + batch_size = config.DATA.BATCH_SIZE + else: + batch_size = config.DATA.BATCH_SIZE_EVAL + + if multi_process is True: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=(mode == 'train')) + dataloader = DataLoader(dataset, + batch_sampler=sampler, + num_workers=config.DATA.NUM_WORKERS) + else: + dataloader = DataLoader(dataset, + batch_size=batch_size, + num_workers=config.DATA.NUM_WORKERS, + shuffle=(mode == 'train')) + return dataloader diff --git a/image_classification/PVTv2/droppath.py b/image_classification/PVTv2/droppath.py new file mode 100644 index 00000000..d7ecf00c --- /dev/null +++ b/image_classification/PVTv2/droppath.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth +""" + +import numpy as np +import paddle +import paddle.nn as nn + + +class DropPath(nn.Layer): + """DropPath class""" + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def drop_path(self, inputs): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if self.drop_prob == 0. 
or not self.training: + return inputs + keep_prob = 1 - self.drop_prob + keep_prob = paddle.to_tensor(keep_prob, dtype='float32') + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide(keep_prob) * random_tensor # divide is to keep same output expectation + return output + + def forward(self, inputs): + return self.drop_path(inputs) + + +#def main(): +# tmp = paddle.to_tensor(np.random.rand(8, 16, 8, 8), dtype='float32') +# dp = DropPath(0.5) +# out = dp(tmp) +# print(out) +# +#if __name__ == "__main__": +# main() diff --git a/image_classification/PVTv2/main_multi_gpu.py b/image_classification/PVTv2/main_multi_gpu.py new file mode 100644 index 00000000..cb8dc67e --- /dev/null +++ b/image_classification/PVTv2/main_multi_gpu.py @@ -0,0 +1,365 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""PVTv2 training/validation using multiple GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as dist +from datasets import get_dataloader, get_dataset +from pvtv2 import build_pvtv2 as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('PVTv2') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +arguments = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, arguments) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +#config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + 
optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + # + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + dist.all_reduce(loss) + dist.all_reduce(acc1) + dist.all_reduce(acc5) + loss = loss / dist.get_world_size() + acc1 = acc1 / dist.get_world_size() + acc5 = acc5 / dist.get_world_size() + + batch_size = paddle.to_tensor(image.shape[0]) + dist.all_reduce(batch_size) + + val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) + val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) + val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0]) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main_worker(*args): + # 0. 
Preparation + dist.init_parallel_env() + last_epoch = config.TRAIN.LAST_EPOCH + world_size = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + seed = config.SEED + local_rank + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # 1. Create model + model = build_model(config) + model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train, dataset_val = args[0], args[1] + dataloader_train = get_dataloader(config, dataset_train, 'train', True) + dataloader_val = get_dataloader(config, dataset_val, 'test', True) + total_batch_train = len(dataloader_train) + total_batch_val = len(dataloader_val) + logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. Define optimizer and lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip, + apply_decay_param_fun=get_exclude_from_weight_decay_fn([ + 'absolute_pos_embed', 'relative_position_bias_table']), + ) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # 5. 
Load pretrained model / load resumt model and optimizer states + if config.MODEL.PRETRAINED: + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + + # 6. Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + + # 6. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER) + scheduler.step() + + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if local_rank == 0: + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +def main(): + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS + dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + + +if __name__ == "__main__": + main() diff --git a/image_classification/PVTv2/main_single_gpu.py b/image_classification/PVTv2/main_single_gpu.py new file mode 100644 index 00000000..f397191a --- 
/dev/null +++ b/image_classification/PVTv2/main_single_gpu.py @@ -0,0 +1,338 @@ + +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""PVTv2 training/validation using single GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from datasets import get_dataloader +from datasets import get_dataset +from pvtv2 import build_pvtv2 as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('PVTv2') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +args = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, args) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +#config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in 
nn.CrossEntropyLoss is set to 'mean' + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + batch_size = image.shape[0] + val_loss_meter.update(loss.numpy()[0], batch_size) + val_acc1_meter.update(acc1.numpy()[0], batch_size) + val_acc5_meter.update(acc5.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main(): + # 0. Preparation + last_epoch = config.TRAIN.LAST_EPOCH + seed = config.SEED + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + #paddle.set_device('gpu:0') + # 1. Create model + model = build_model(config) + #model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + dataloader_train = get_dataloader(config, dataset_train, 'train', False) + dataloader_val = get_dataloader(config, dataset_val, 'val', False) + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. 
Define lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + # 5. Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip, + apply_decay_param_fun=get_exclude_from_weight_decay_fn([ + 'absolute_pos_embed', 'relative_position_bias_table']), + ) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # 6. Load pretrained model or load resume model and optimizer states + if config.MODEL.PRETRAINED: + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_dict(opt_state) + logger.info( + f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") + # 7. Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + # 8. 
Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=len(dataloader_train), + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + ) + scheduler.step() + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +if __name__ == "__main__": + main() diff --git a/image_classification/PVTv2/port_weights/load_pytorch_weights_b0.py b/image_classification/PVTv2/port_weights/load_pytorch_weights_b0.py new file mode 100644 index 00000000..86aca23b --- /dev/null +++ b/image_classification/PVTv2/port_weights/load_pytorch_weights_b0.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
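+
+# NOTE: this script converts the official PyTorch PVTv2-B0 weights to
+# PaddlePaddle. It builds both models on CPU, copies parameters by name using
+# torch_to_paddle_mapping() (2-D linear weights are transposed in _set_value),
+# checks that both models produce near-identical outputs on a random input,
+# and saves the converted weights to ./pvtv2_b0.pdparams.
+# It is expected to be run from the PVTv2 directory so that the relative paths
+# ./configs/pvtv2_b0.yaml and ./pvtv2_pth_models/pvt_v2_b0.pth resolve.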
+ +import argparse +import numpy as np +import paddle +import torch +import timm +from pvtv2_torch import * +from pvtv2 import * +from config import * + + +config = get_config('./configs/pvtv2_b0.yaml') +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + + +def torch_to_paddle_mapping(): + mapping = [ + ('patch_embed1.proj', 'patch_embedding1.patch_embed'), + ('patch_embed1.norm', 'patch_embedding1.norm'), + ('patch_embed2.proj', 'patch_embedding2.patch_embed'), + ('patch_embed2.norm', 'patch_embedding2.norm'), + ('patch_embed3.proj', 'patch_embedding3.patch_embed'), + ('patch_embed3.norm', 'patch_embedding3.norm'), + ('patch_embed4.proj', 'patch_embedding4.patch_embed'), + ('patch_embed4.norm', 'patch_embedding4.norm'), + ('norm1', 'norm1'), + ('norm2', 'norm2'), + ('norm3', 'norm3'), + ('norm4', 'norm4'), + ] + + # torch 'layers' to paddle 'stages' + depths = config.MODEL.TRANS.STAGE_DEPTHS + num_stages = len(depths) + for stage_idx in range(num_stages): + pp_s_prefix = f'block{stage_idx+1}' + th_s_prefix = f'block{stage_idx+1}' + + if stage_idx==3: + for block_idx in range(depths[stage_idx]): + th_b_prefix = f'{th_s_prefix}.{block_idx}' + pp_b_prefix = f'{pp_s_prefix}.{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.norm1', f'{pp_b_prefix}.norm1'), + (f'{th_b_prefix}.attn.q', f'{pp_b_prefix}.attn.q'), + (f'{th_b_prefix}.attn.kv', f'{pp_b_prefix}.attn.kv'), + (f'{th_b_prefix}.attn.proj', f'{pp_b_prefix}.attn.proj'), + (f'{th_b_prefix}.norm2', f'{pp_b_prefix}.norm2'), + (f'{th_b_prefix}.mlp.fc1', f'{pp_b_prefix}.mlp.fc1'), + (f'{th_b_prefix}.mlp.dwconv.dwconv', f'{pp_b_prefix}.mlp.dwconv.dwconv'), + (f'{th_b_prefix}.mlp.fc2', f'{pp_b_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + break + + for block_idx in range(depths[stage_idx]): + th_b_prefix = f'{th_s_prefix}.{block_idx}' + pp_b_prefix = f'{pp_s_prefix}.{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.norm1', f'{pp_b_prefix}.norm1'), + (f'{th_b_prefix}.attn.q', f'{pp_b_prefix}.attn.q'), + (f'{th_b_prefix}.attn.kv', f'{pp_b_prefix}.attn.kv'), + (f'{th_b_prefix}.attn.proj', f'{pp_b_prefix}.attn.proj'), + (f'{th_b_prefix}.attn.sr', f'{pp_b_prefix}.attn.sr'), + (f'{th_b_prefix}.attn.norm', f'{pp_b_prefix}.attn.norm'), + (f'{th_b_prefix}.norm2', f'{pp_b_prefix}.norm2'), + (f'{th_b_prefix}.mlp.fc1', f'{pp_b_prefix}.mlp.fc1'), + (f'{th_b_prefix}.mlp.dwconv.dwconv', f'{pp_b_prefix}.mlp.dwconv.dwconv'), + (f'{th_b_prefix}.mlp.fc2', f'{pp_b_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + mapping.extend([('head', 'head')]) + return mapping + + +def convert(torch_model, paddle_model): + + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if th_name.endswith('relative_position_bias_table'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_pvtv2(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + + device = torch.device('cpu') + # load weights from local + torch_model = pvt_v2_b0(pretrained=True) + pre=torch.load('./pvtv2_pth_models/pvt_v2_b0.pth') + torch_model.load_state_dict(pre) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + #x = np.ones((1, 3, 224, 224)).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:20]) + print(out_paddle[0, 0:20]) + assert np.allclose(out_torch, out_paddle, atol = 1e-3) + + # save weights for paddle model + model_path = os.path.join('./pvtv2_b0.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/PVTv2/port_weights/load_pytorch_weights_b1.py b/image_classification/PVTv2/port_weights/load_pytorch_weights_b1.py new file mode 100644 index 00000000..e61a69bd --- /dev/null +++ b/image_classification/PVTv2/port_weights/load_pytorch_weights_b1.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +import timm +from pvtv2_torch import * +from pvtv2 import * +from config import * + + +config = get_config('./configs/pvtv2_b1.yaml') +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + + +def torch_to_paddle_mapping(): + mapping = [ + ('patch_embed1.proj', 'patch_embedding1.patch_embed'), + ('patch_embed1.norm', 'patch_embedding1.norm'), + ('patch_embed2.proj', 'patch_embedding2.patch_embed'), + ('patch_embed2.norm', 'patch_embedding2.norm'), + ('patch_embed3.proj', 'patch_embedding3.patch_embed'), + ('patch_embed3.norm', 'patch_embedding3.norm'), + ('patch_embed4.proj', 'patch_embedding4.patch_embed'), + ('patch_embed4.norm', 'patch_embedding4.norm'), + ('norm1', 'norm1'), + ('norm2', 'norm2'), + ('norm3', 'norm3'), + ('norm4', 'norm4'), + ] + + # torch 'layers' to paddle 'stages' + depths = config.MODEL.TRANS.STAGE_DEPTHS + num_stages = len(depths) + for stage_idx in range(num_stages): + pp_s_prefix = f'block{stage_idx+1}' + th_s_prefix = f'block{stage_idx+1}' + + if stage_idx==3: + for block_idx in range(depths[stage_idx]): + th_b_prefix = f'{th_s_prefix}.{block_idx}' + pp_b_prefix = f'{pp_s_prefix}.{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.norm1', f'{pp_b_prefix}.norm1'), + (f'{th_b_prefix}.attn.q', f'{pp_b_prefix}.attn.q'), + (f'{th_b_prefix}.attn.kv', f'{pp_b_prefix}.attn.kv'), + (f'{th_b_prefix}.attn.proj', f'{pp_b_prefix}.attn.proj'), + (f'{th_b_prefix}.norm2', f'{pp_b_prefix}.norm2'), + (f'{th_b_prefix}.mlp.fc1', f'{pp_b_prefix}.mlp.fc1'), + (f'{th_b_prefix}.mlp.dwconv.dwconv', f'{pp_b_prefix}.mlp.dwconv.dwconv'), + (f'{th_b_prefix}.mlp.fc2', f'{pp_b_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + break + + for block_idx in range(depths[stage_idx]): + th_b_prefix = f'{th_s_prefix}.{block_idx}' + pp_b_prefix = f'{pp_s_prefix}.{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.norm1', f'{pp_b_prefix}.norm1'), + (f'{th_b_prefix}.attn.q', f'{pp_b_prefix}.attn.q'), + (f'{th_b_prefix}.attn.kv', f'{pp_b_prefix}.attn.kv'), + (f'{th_b_prefix}.attn.proj', f'{pp_b_prefix}.attn.proj'), + (f'{th_b_prefix}.attn.sr', f'{pp_b_prefix}.attn.sr'), + (f'{th_b_prefix}.attn.norm', f'{pp_b_prefix}.attn.norm'), + (f'{th_b_prefix}.norm2', f'{pp_b_prefix}.norm2'), + (f'{th_b_prefix}.mlp.fc1', f'{pp_b_prefix}.mlp.fc1'), + (f'{th_b_prefix}.mlp.dwconv.dwconv', f'{pp_b_prefix}.mlp.dwconv.dwconv'), + (f'{th_b_prefix}.mlp.fc2', f'{pp_b_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + mapping.extend([('head', 'head')]) + return mapping + + +def convert(torch_model, paddle_model): + + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if th_name.endswith('relative_position_bias_table'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_pvtv2(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + + device = torch.device('cpu') + # load weights from local + torch_model = pvt_v2_b1(pretrained=True) + pre=torch.load('./pvtv2_pth_models/pvt_v2_b1.pth') + torch_model.load_state_dict(pre) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + #x = np.ones((1, 3, 224, 224)).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:20]) + print(out_paddle[0, 0:20]) + assert np.allclose(out_torch, out_paddle, atol = 1e-3) + + # save weights for paddle model + model_path = os.path.join('./pvtv2_b1.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/PVTv2/port_weights/load_pytorch_weights_b2.py b/image_classification/PVTv2/port_weights/load_pytorch_weights_b2.py new file mode 100644 index 00000000..967279d3 --- /dev/null +++ b/image_classification/PVTv2/port_weights/load_pytorch_weights_b2.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +import timm +from pvtv2_torch import * +from pvtv2 import * +from config import * + + +config = get_config('./configs/pvtv2_b2.yaml') +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + + +def torch_to_paddle_mapping(): + mapping = [ + ('patch_embed1.proj', 'patch_embedding1.patch_embed'), + ('patch_embed1.norm', 'patch_embedding1.norm'), + ('patch_embed2.proj', 'patch_embedding2.patch_embed'), + ('patch_embed2.norm', 'patch_embedding2.norm'), + ('patch_embed3.proj', 'patch_embedding3.patch_embed'), + ('patch_embed3.norm', 'patch_embedding3.norm'), + ('patch_embed4.proj', 'patch_embedding4.patch_embed'), + ('patch_embed4.norm', 'patch_embedding4.norm'), + ('norm1', 'norm1'), + ('norm2', 'norm2'), + ('norm3', 'norm3'), + ('norm4', 'norm4'), + ] + + # torch 'layers' to paddle 'stages' + depths = config.MODEL.TRANS.STAGE_DEPTHS + num_stages = len(depths) + for stage_idx in range(num_stages): + pp_s_prefix = f'block{stage_idx+1}' + th_s_prefix = f'block{stage_idx+1}' + + if stage_idx==3: + for block_idx in range(depths[stage_idx]): + th_b_prefix = f'{th_s_prefix}.{block_idx}' + pp_b_prefix = f'{pp_s_prefix}.{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.norm1', f'{pp_b_prefix}.norm1'), + (f'{th_b_prefix}.attn.q', f'{pp_b_prefix}.attn.q'), + (f'{th_b_prefix}.attn.kv', f'{pp_b_prefix}.attn.kv'), + (f'{th_b_prefix}.attn.proj', f'{pp_b_prefix}.attn.proj'), + (f'{th_b_prefix}.norm2', f'{pp_b_prefix}.norm2'), + (f'{th_b_prefix}.mlp.fc1', f'{pp_b_prefix}.mlp.fc1'), + (f'{th_b_prefix}.mlp.dwconv.dwconv', f'{pp_b_prefix}.mlp.dwconv.dwconv'), + (f'{th_b_prefix}.mlp.fc2', f'{pp_b_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + break + + for block_idx in range(depths[stage_idx]): + th_b_prefix = f'{th_s_prefix}.{block_idx}' + pp_b_prefix = f'{pp_s_prefix}.{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.norm1', f'{pp_b_prefix}.norm1'), + (f'{th_b_prefix}.attn.q', f'{pp_b_prefix}.attn.q'), + (f'{th_b_prefix}.attn.kv', f'{pp_b_prefix}.attn.kv'), + (f'{th_b_prefix}.attn.proj', f'{pp_b_prefix}.attn.proj'), + (f'{th_b_prefix}.attn.sr', f'{pp_b_prefix}.attn.sr'), + (f'{th_b_prefix}.attn.norm', f'{pp_b_prefix}.attn.norm'), + (f'{th_b_prefix}.norm2', f'{pp_b_prefix}.norm2'), + (f'{th_b_prefix}.mlp.fc1', f'{pp_b_prefix}.mlp.fc1'), + (f'{th_b_prefix}.mlp.dwconv.dwconv', f'{pp_b_prefix}.mlp.dwconv.dwconv'), + (f'{th_b_prefix}.mlp.fc2', f'{pp_b_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + mapping.extend([('head', 'head')]) + return mapping + + +def convert(torch_model, paddle_model): + + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if th_name.endswith('relative_position_bias_table'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_pvtv2(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + + device = torch.device('cpu') + # load weights from local + torch_model = pvt_v2_b2(pretrained=True) + pre=torch.load('./pvtv2_pth_models/pvt_v2_b2.pth') + torch_model.load_state_dict(pre) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + #x = np.ones((1, 3, 224, 224)).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:20]) + print(out_paddle[0, 0:20]) + assert np.allclose(out_torch, out_paddle, atol = 1e-3) + + # save weights for paddle model + model_path = os.path.join('./pvtv2_b2.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/PVTv2/port_weights/load_pytorch_weights_b2_linear.py b/image_classification/PVTv2/port_weights/load_pytorch_weights_b2_linear.py new file mode 100644 index 00000000..b54ccea2 --- /dev/null +++ b/image_classification/PVTv2/port_weights/load_pytorch_weights_b2_linear.py @@ -0,0 +1,186 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
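+
+# NOTE: this variant ports PVTv2-B2-Linear, which uses linear spatial-reduction
+# attention. Its torch_to_paddle_mapping() therefore also maps attn.sr and
+# attn.norm in the last stage, since the linear SRA branch keeps the
+# pool + sr + norm layers in every stage.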
+ +import argparse +import numpy as np +import paddle +import torch +import timm +from pvtv2_torch import * +from pvtv2 import * +from config import * + + +config = get_config('./configs/pvtv2_b2_linear.yaml') +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + + +def torch_to_paddle_mapping(): + mapping = [ + ('patch_embed1.proj', 'patch_embedding1.patch_embed'), + ('patch_embed1.norm', 'patch_embedding1.norm'), + ('patch_embed2.proj', 'patch_embedding2.patch_embed'), + ('patch_embed2.norm', 'patch_embedding2.norm'), + ('patch_embed3.proj', 'patch_embedding3.patch_embed'), + ('patch_embed3.norm', 'patch_embedding3.norm'), + ('patch_embed4.proj', 'patch_embedding4.patch_embed'), + ('patch_embed4.norm', 'patch_embedding4.norm'), + ('norm1', 'norm1'), + ('norm2', 'norm2'), + ('norm3', 'norm3'), + ('norm4', 'norm4'), + ] + + # torch 'layers' to paddle 'stages' + depths = config.MODEL.TRANS.STAGE_DEPTHS + num_stages = len(depths) + for stage_idx in range(num_stages): + pp_s_prefix = f'block{stage_idx+1}' + th_s_prefix = f'block{stage_idx+1}' + + if stage_idx==3: + for block_idx in range(depths[stage_idx]): + th_b_prefix = f'{th_s_prefix}.{block_idx}' + pp_b_prefix = f'{pp_s_prefix}.{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.norm1', f'{pp_b_prefix}.norm1'), + (f'{th_b_prefix}.attn.q', f'{pp_b_prefix}.attn.q'), + (f'{th_b_prefix}.attn.kv', f'{pp_b_prefix}.attn.kv'), + (f'{th_b_prefix}.attn.proj', f'{pp_b_prefix}.attn.proj'), + (f'{th_b_prefix}.attn.sr', f'{pp_b_prefix}.attn.sr'), + (f'{th_b_prefix}.attn.norm', f'{pp_b_prefix}.attn.norm'), + (f'{th_b_prefix}.norm2', f'{pp_b_prefix}.norm2'), + (f'{th_b_prefix}.mlp.fc1', f'{pp_b_prefix}.mlp.fc1'), + (f'{th_b_prefix}.mlp.dwconv.dwconv', f'{pp_b_prefix}.mlp.dwconv.dwconv'), + (f'{th_b_prefix}.mlp.fc2', f'{pp_b_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + break + + for block_idx in range(depths[stage_idx]): + th_b_prefix = f'{th_s_prefix}.{block_idx}' + pp_b_prefix = f'{pp_s_prefix}.{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.norm1', f'{pp_b_prefix}.norm1'), + (f'{th_b_prefix}.attn.q', f'{pp_b_prefix}.attn.q'), + (f'{th_b_prefix}.attn.kv', f'{pp_b_prefix}.attn.kv'), + (f'{th_b_prefix}.attn.proj', f'{pp_b_prefix}.attn.proj'), + (f'{th_b_prefix}.attn.sr', f'{pp_b_prefix}.attn.sr'), + (f'{th_b_prefix}.attn.norm', f'{pp_b_prefix}.attn.norm'), + (f'{th_b_prefix}.norm2', f'{pp_b_prefix}.norm2'), + (f'{th_b_prefix}.mlp.fc1', f'{pp_b_prefix}.mlp.fc1'), + (f'{th_b_prefix}.mlp.dwconv.dwconv', f'{pp_b_prefix}.mlp.dwconv.dwconv'), + (f'{th_b_prefix}.mlp.fc2', f'{pp_b_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + mapping.extend([('head', 'head')]) + return mapping + + +def convert(torch_model, paddle_model): + + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + # 2. 
get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if th_name.endswith('relative_position_bias_table'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_pvtv2(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + + device = torch.device('cpu') + # load weights from local + torch_model = pvt_v2_b2_li(pretrained=True) + pre=torch.load('./pvtv2_pth_models/pvt_v2_b2_li.pth') + torch_model.load_state_dict(pre) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + #x = np.ones((1, 3, 224, 224)).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:20]) + print(out_paddle[0, 0:20]) + assert np.allclose(out_torch, out_paddle, atol = 1e-3) + + # save weights for paddle model + model_path = os.path.join('./pvtv2_b2_linear.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/PVTv2/port_weights/load_pytorch_weights_b3.py b/image_classification/PVTv2/port_weights/load_pytorch_weights_b3.py new file mode 100644 index 00000000..3ca1c8f9 --- /dev/null +++ b/image_classification/PVTv2/port_weights/load_pytorch_weights_b3.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +import timm +from pvtv2_torch import * +from pvtv2 import * +from config import * + + +config = get_config('./configs/pvtv2_b3.yaml') +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + + +def torch_to_paddle_mapping(): + mapping = [ + ('patch_embed1.proj', 'patch_embedding1.patch_embed'), + ('patch_embed1.norm', 'patch_embedding1.norm'), + ('patch_embed2.proj', 'patch_embedding2.patch_embed'), + ('patch_embed2.norm', 'patch_embedding2.norm'), + ('patch_embed3.proj', 'patch_embedding3.patch_embed'), + ('patch_embed3.norm', 'patch_embedding3.norm'), + ('patch_embed4.proj', 'patch_embedding4.patch_embed'), + ('patch_embed4.norm', 'patch_embedding4.norm'), + ('norm1', 'norm1'), + ('norm2', 'norm2'), + ('norm3', 'norm3'), + ('norm4', 'norm4'), + ] + + # torch 'layers' to paddle 'stages' + depths = config.MODEL.TRANS.STAGE_DEPTHS + num_stages = len(depths) + for stage_idx in range(num_stages): + pp_s_prefix = f'block{stage_idx+1}' + th_s_prefix = f'block{stage_idx+1}' + + if stage_idx==3: + for block_idx in range(depths[stage_idx]): + th_b_prefix = f'{th_s_prefix}.{block_idx}' + pp_b_prefix = f'{pp_s_prefix}.{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.norm1', f'{pp_b_prefix}.norm1'), + (f'{th_b_prefix}.attn.q', f'{pp_b_prefix}.attn.q'), + (f'{th_b_prefix}.attn.kv', f'{pp_b_prefix}.attn.kv'), + (f'{th_b_prefix}.attn.proj', f'{pp_b_prefix}.attn.proj'), + (f'{th_b_prefix}.norm2', f'{pp_b_prefix}.norm2'), + (f'{th_b_prefix}.mlp.fc1', f'{pp_b_prefix}.mlp.fc1'), + (f'{th_b_prefix}.mlp.dwconv.dwconv', f'{pp_b_prefix}.mlp.dwconv.dwconv'), + (f'{th_b_prefix}.mlp.fc2', f'{pp_b_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + break + + for block_idx in range(depths[stage_idx]): + th_b_prefix = f'{th_s_prefix}.{block_idx}' + pp_b_prefix = f'{pp_s_prefix}.{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.norm1', f'{pp_b_prefix}.norm1'), + (f'{th_b_prefix}.attn.q', f'{pp_b_prefix}.attn.q'), + (f'{th_b_prefix}.attn.kv', f'{pp_b_prefix}.attn.kv'), + (f'{th_b_prefix}.attn.proj', f'{pp_b_prefix}.attn.proj'), + (f'{th_b_prefix}.attn.sr', f'{pp_b_prefix}.attn.sr'), + (f'{th_b_prefix}.attn.norm', f'{pp_b_prefix}.attn.norm'), + (f'{th_b_prefix}.norm2', f'{pp_b_prefix}.norm2'), + (f'{th_b_prefix}.mlp.fc1', f'{pp_b_prefix}.mlp.fc1'), + (f'{th_b_prefix}.mlp.dwconv.dwconv', f'{pp_b_prefix}.mlp.dwconv.dwconv'), + (f'{th_b_prefix}.mlp.fc2', f'{pp_b_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + mapping.extend([('head', 'head')]) + return mapping + + +def convert(torch_model, paddle_model): + + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if th_name.endswith('relative_position_bias_table'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_pvtv2(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + + device = torch.device('cpu') + # load weights from local + torch_model = pvt_v2_b3(pretrained=True) + pre=torch.load('./pvtv2_pth_models/pvt_v2_b3.pth') + torch_model.load_state_dict(pre) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + #x = np.ones((1, 3, 224, 224)).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:20]) + print(out_paddle[0, 0:20]) + assert np.allclose(out_torch, out_paddle, atol = 1e-3) + + # save weights for paddle model + model_path = os.path.join('./pvtv2_b3.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/PVTv2/port_weights/load_pytorch_weights_b4.py b/image_classification/PVTv2/port_weights/load_pytorch_weights_b4.py new file mode 100644 index 00000000..538925c0 --- /dev/null +++ b/image_classification/PVTv2/port_weights/load_pytorch_weights_b4.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +import timm +from pvtv2_torch import * +from pvtv2 import * +from config import * + + +config = get_config('./configs/pvtv2_b4.yaml') +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + + +def torch_to_paddle_mapping(): + mapping = [ + ('patch_embed1.proj', 'patch_embedding1.patch_embed'), + ('patch_embed1.norm', 'patch_embedding1.norm'), + ('patch_embed2.proj', 'patch_embedding2.patch_embed'), + ('patch_embed2.norm', 'patch_embedding2.norm'), + ('patch_embed3.proj', 'patch_embedding3.patch_embed'), + ('patch_embed3.norm', 'patch_embedding3.norm'), + ('patch_embed4.proj', 'patch_embedding4.patch_embed'), + ('patch_embed4.norm', 'patch_embedding4.norm'), + ('norm1', 'norm1'), + ('norm2', 'norm2'), + ('norm3', 'norm3'), + ('norm4', 'norm4'), + ] + + # torch 'layers' to paddle 'stages' + depths = config.MODEL.TRANS.STAGE_DEPTHS + num_stages = len(depths) + for stage_idx in range(num_stages): + pp_s_prefix = f'block{stage_idx+1}' + th_s_prefix = f'block{stage_idx+1}' + + if stage_idx==3: + for block_idx in range(depths[stage_idx]): + th_b_prefix = f'{th_s_prefix}.{block_idx}' + pp_b_prefix = f'{pp_s_prefix}.{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.norm1', f'{pp_b_prefix}.norm1'), + (f'{th_b_prefix}.attn.q', f'{pp_b_prefix}.attn.q'), + (f'{th_b_prefix}.attn.kv', f'{pp_b_prefix}.attn.kv'), + (f'{th_b_prefix}.attn.proj', f'{pp_b_prefix}.attn.proj'), + (f'{th_b_prefix}.norm2', f'{pp_b_prefix}.norm2'), + (f'{th_b_prefix}.mlp.fc1', f'{pp_b_prefix}.mlp.fc1'), + (f'{th_b_prefix}.mlp.dwconv.dwconv', f'{pp_b_prefix}.mlp.dwconv.dwconv'), + (f'{th_b_prefix}.mlp.fc2', f'{pp_b_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + break + + for block_idx in range(depths[stage_idx]): + th_b_prefix = f'{th_s_prefix}.{block_idx}' + pp_b_prefix = f'{pp_s_prefix}.{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.norm1', f'{pp_b_prefix}.norm1'), + (f'{th_b_prefix}.attn.q', f'{pp_b_prefix}.attn.q'), + (f'{th_b_prefix}.attn.kv', f'{pp_b_prefix}.attn.kv'), + (f'{th_b_prefix}.attn.proj', f'{pp_b_prefix}.attn.proj'), + (f'{th_b_prefix}.attn.sr', f'{pp_b_prefix}.attn.sr'), + (f'{th_b_prefix}.attn.norm', f'{pp_b_prefix}.attn.norm'), + (f'{th_b_prefix}.norm2', f'{pp_b_prefix}.norm2'), + (f'{th_b_prefix}.mlp.fc1', f'{pp_b_prefix}.mlp.fc1'), + (f'{th_b_prefix}.mlp.dwconv.dwconv', f'{pp_b_prefix}.mlp.dwconv.dwconv'), + (f'{th_b_prefix}.mlp.fc2', f'{pp_b_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + mapping.extend([('head', 'head')]) + return mapping + + +def convert(torch_model, paddle_model): + + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if th_name.endswith('relative_position_bias_table'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_pvtv2(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + + device = torch.device('cpu') + # load weights from local + torch_model = pvt_v2_b4(pretrained=True) + pre=torch.load('./pvtv2_pth_models/pvt_v2_b4.pth') + torch_model.load_state_dict(pre) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + #x = np.ones((1, 3, 224, 224)).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:20]) + print(out_paddle[0, 0:20]) + assert np.allclose(out_torch, out_paddle, atol = 1e-3) + + # save weights for paddle model + model_path = os.path.join('./pvtv2_b4.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/PVTv2/port_weights/load_pytorch_weights_b5.py b/image_classification/PVTv2/port_weights/load_pytorch_weights_b5.py new file mode 100644 index 00000000..fed57359 --- /dev/null +++ b/image_classification/PVTv2/port_weights/load_pytorch_weights_b5.py @@ -0,0 +1,185 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +import timm +from pvtv2_torch import * +from pvtv2 import * +from config import * + + +config = get_config('./configs/pvtv2_b5.yaml') +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + + +def torch_to_paddle_mapping(): + mapping = [ + ('patch_embed1.proj', 'patch_embedding1.patch_embed'), + ('patch_embed1.norm', 'patch_embedding1.norm'), + ('patch_embed2.proj', 'patch_embedding2.patch_embed'), + ('patch_embed2.norm', 'patch_embedding2.norm'), + ('patch_embed3.proj', 'patch_embedding3.patch_embed'), + ('patch_embed3.norm', 'patch_embedding3.norm'), + ('patch_embed4.proj', 'patch_embedding4.patch_embed'), + ('patch_embed4.norm', 'patch_embedding4.norm'), + ('norm1', 'norm1'), + ('norm2', 'norm2'), + ('norm3', 'norm3'), + ('norm4', 'norm4'), + ] + + # torch 'layers' to paddle 'stages' + depths = config.MODEL.TRANS.STAGE_DEPTHS + num_stages = len(depths) + for stage_idx in range(num_stages): + pp_s_prefix = f'block{stage_idx+1}' + th_s_prefix = f'block{stage_idx+1}' + + if stage_idx==3: + for block_idx in range(depths[stage_idx]): + th_b_prefix = f'{th_s_prefix}.{block_idx}' + pp_b_prefix = f'{pp_s_prefix}.{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.norm1', f'{pp_b_prefix}.norm1'), + (f'{th_b_prefix}.attn.q', f'{pp_b_prefix}.attn.q'), + (f'{th_b_prefix}.attn.kv', f'{pp_b_prefix}.attn.kv'), + (f'{th_b_prefix}.attn.proj', f'{pp_b_prefix}.attn.proj'), + (f'{th_b_prefix}.norm2', f'{pp_b_prefix}.norm2'), + (f'{th_b_prefix}.mlp.fc1', f'{pp_b_prefix}.mlp.fc1'), + (f'{th_b_prefix}.mlp.dwconv.dwconv', f'{pp_b_prefix}.mlp.dwconv.dwconv'), + (f'{th_b_prefix}.mlp.fc2', f'{pp_b_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + break + + for block_idx in range(depths[stage_idx]): + th_b_prefix = f'{th_s_prefix}.{block_idx}' + pp_b_prefix = f'{pp_s_prefix}.{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.norm1', f'{pp_b_prefix}.norm1'), + (f'{th_b_prefix}.attn.q', f'{pp_b_prefix}.attn.q'), + (f'{th_b_prefix}.attn.kv', f'{pp_b_prefix}.attn.kv'), + (f'{th_b_prefix}.attn.proj', f'{pp_b_prefix}.attn.proj'), + (f'{th_b_prefix}.attn.sr', f'{pp_b_prefix}.attn.sr'), + (f'{th_b_prefix}.attn.norm', f'{pp_b_prefix}.attn.norm'), + (f'{th_b_prefix}.norm2', f'{pp_b_prefix}.norm2'), + (f'{th_b_prefix}.mlp.fc1', f'{pp_b_prefix}.mlp.fc1'), + (f'{th_b_prefix}.mlp.dwconv.dwconv', f'{pp_b_prefix}.mlp.dwconv.dwconv'), + (f'{th_b_prefix}.mlp.fc2', f'{pp_b_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + mapping.extend([('head', 'head')]) + return mapping + + +def convert(torch_model, paddle_model): + + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if th_name.endswith('relative_position_bias_table'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_pvtv2(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + + device = torch.device('cpu') + # load weights from local + #torch_model = pvt_v2_b5(pretrained=False) + torch_model = pvt_v2_b5(pretrained=True) + pre=torch.load('./pvtv2_pth_models/pvt_v2_b5.pth') + torch_model.load_state_dict(pre) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + #x = np.ones((1, 3, 224, 224)).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:20]) + print(out_paddle[0, 0:20]) + assert np.allclose(out_torch, out_paddle, atol = 1e-3) + + # save weights for paddle model + model_path = os.path.join('./pvtv2_b5.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/PVTv2/pvtv2.png b/image_classification/PVTv2/pvtv2.png new file mode 100644 index 00000000..d9eab43c Binary files /dev/null and b/image_classification/PVTv2/pvtv2.png differ diff --git a/image_classification/PVTv2/pvtv2.py b/image_classification/PVTv2/pvtv2.py new file mode 100644 index 00000000..b1bdf12f --- /dev/null +++ b/image_classification/PVTv2/pvtv2.py @@ -0,0 +1,435 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Implement Transformer Class for PVTv2 +""" + +import copy +import paddle +import paddle.nn as nn +from droppath import DropPath + + +class Identity(nn.Layer): + """ Identity layer + + The output of this layer is the input without any change. 
+ Use this layer to avoid if condition in some forward methods + + """ + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class DWConv(nn.Layer): + """Depth-Wise convolution 3x3 + + Improve the local continuity of features. + + """ + def __init__(self, dim=768): + super(DWConv, self).__init__() + self.dwconv = nn.Conv2D(dim, dim, 3, 1, 1, bias_attr=True, groups=dim) + + def forward(self, x, H, W): + B, _, C = x.shape + x = x.transpose([0,2,1]).reshape([B, C, H, W]) + x = self.dwconv(x) + x = x.flatten(2).transpose([0,2,1]) + + return x + + +class OverlapPatchEmbedding(nn.Layer): + """Overlapping Patch Embedding + + Apply Overlapping Patch Embedding on input images. Embeddings is implemented using a Conv2D op. + Making adjacent windows overlap by half of the area, and pad the feature map with zeros to keep + the resolution. + + Attributes: + image_size: int, input image size, default: 224 + patch_size: int, size of patch, default: 7 + in_channels: int, input image channels, default: 3 + embed_dim: int, embedding dimension, default: 768 + """ + + def __init__(self, image_size=224, patch_size=7, stride=4, in_channels=3, embed_dim=768): + super().__init__() + image_size = (image_size, image_size) # TODO: add to_2tuple + patch_size = (patch_size, patch_size) + + self.image_size = image_size + self.patch_size = patch_size + self.H, self.W = image_size[0] // patch_size[0], image_size[1] // patch_size[1] + self.num_patches = self.H * self.W + + self.patch_embed = nn.Conv2D(in_channels=in_channels, + out_channels=embed_dim, + kernel_size=patch_size, + stride=stride, + padding=(patch_size[0] // 2, patch_size[1] // 2)) + self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + return weight_attr, bias_attr + + def forward(self, x): + x = self.patch_embed(x) # [batch, embed_dim, h, w] h,w = patch_resolution + _, _, H, W = x.shape + x = x.flatten(start_axis=2, stop_axis=-1) # [batch, embed_dim, h*w] h*w = num_patches + x = x.transpose([0, 2, 1]) # [batch, h*w, embed_dim] + x = self.norm(x) # [batch, num_patches, embed_dim] + + return x, H, W + + +class Mlp(nn.Layer): + """ MLP module + + Impl using nn.Linear and activation is GELU, dropout is applied. 
+ Ops: fc -> dwconv -> act -> dropout -> fc -> dropout + + Attributes: + fc1: nn.Linear + fc2: nn.Linear + dwconv: Depth-Wise Convolution + act: GELU + dropout1: dropout after fc1 + dropout2: dropout after fc2 + """ + + def __init__(self, in_features, hidden_features, dropout=0.0, linear=False): + super(Mlp, self).__init__() + w_attr_1, b_attr_1 = self._init_weights() + self.fc1 = nn.Linear(in_features, + hidden_features, + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + self.dwconv = DWConv(hidden_features) + + w_attr_2, b_attr_2 = self._init_weights() + self.fc2 = nn.Linear(hidden_features, + in_features, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + self.act = nn.GELU() + self.dropout = nn.Dropout(dropout) + self.linear = linear + if self.linear: + self.relu = nn.ReLU() + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierUniform()) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Normal(std=1e-6)) + return weight_attr, bias_attr + + def forward(self, x, H, W): + x = self.fc1(x) + if self.linear: + x = self.relu(x) + x = self.dwconv(x, H, W) + x = self.act(x) + x = self.dropout(x) + x = self.fc2(x) + x = self.dropout(x) + return x + + +class Attention(nn.Layer): + """ Attention module + + Attention module for PvT, here q, k, v are assumed the same. + The qkv mappings are stored as one single param. + + Attributes: + dim: int, input dimension (channels) + num_heads: number of heads + q: a nn.Linear for q mapping + kv: a nn.Linear for kv mapping + qkv_bias: bool, if True, enable learnable bias to q,k,v, default: True + qk_scale: float, override default qk scale head_dim**-0.5 if set, default: None + attn_dropout: dropout for attention + proj_dropout: final dropout before output + softmax: softmax op for attention + linear: bool, if True, use linear spatial reduction attention instead of spatial reduction attention + sr_ratio: the spatial reduction ratio of SRA (linear spatial reduction attention) + """ + + def __init__(self, + dim, + num_heads, + qkv_bias=False, + qk_scale=None, + attention_dropout=0., + dropout=0., + sr_ratio=1, + linear=False): + """init Attention""" + super(Attention, self).__init__() + self.num_heads = num_heads + self.dim = dim + self.dim_head = dim // num_heads + self.scale = qk_scale or self.dim_head ** -0.5 + + self.q = nn.Linear(dim, dim, bias_attr=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias_attr=qkv_bias) + self.attn_dropout = nn.Dropout(attention_dropout) + self.proj = nn.Linear(dim, dim) + self.proj_dropout = nn.Dropout(dropout) + self.softmax = nn.Softmax(axis=-1) + + self.linear = linear + self.sr_ratio = sr_ratio + if not linear: + if sr_ratio > 1: + self.sr = nn.Conv2D(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = nn.LayerNorm(dim, epsilon=1e-5) + else: + self.pool = nn.AdaptiveAvgPool2D(7) + self.sr = nn.Conv2D(dim, dim, kernel_size=1, stride=1) + self.norm = nn.LayerNorm(dim, epsilon=1e-5) + self.act = nn.GELU() + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + return weight_attr, bias_attr + + def forward(self, x, H, W): + B, N, C = x.shape + q = self.q(x).reshape([B, N, self.num_heads, C // self.num_heads]).transpose([0, 2, 1, 3]) + + if not self.linear: + if self.sr_ratio > 1: + x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W]) + x_ = self.sr(x_).reshape([B, C, -1]).transpose([0, 2, 1]) + x_ = self.norm(x_) + kv = 
self.kv(x_).reshape([B, -1, 2, self.num_heads, C // self.num_heads]).transpose([2, 0, 3, 1, 4]) + else: + kv = self.kv(x).reshape([B, -1, 2, self.num_heads, C // self.num_heads]).transpose([2, 0, 3, 1, 4]) + else: + x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W]) + x_ = self.sr(self.pool(x_)).reshape([B, C, -1]).transpose([0, 2, 1]) + x_ = self.norm(x_) + x_ = self.act(x_) + kv = self.kv(x_).reshape([B, -1, 2, self.num_heads, C // self.num_heads]).transpose([2, 0, 3, 1, 4]) + k, v = kv[0], kv[1] + + attn = paddle.matmul(q, k, transpose_y=True) + attn = attn * self.scale + attn = self.softmax(attn) + attn = self.attn_dropout(attn) + + z = paddle.matmul(attn, v) + z = z.transpose([0, 2, 1, 3]) + new_shape = z.shape[:-2] + [self.dim] + z = z.reshape(new_shape) + z = self.proj(z) + z = self.proj_dropout(z) + + return z + + +class PvTv2Block(nn.Layer): + """Pyramid VisionTransformerV2 block + + Contains multi head efficient self attention, droppath, mlp, norm. + + Attributes: + dim: int, input dimension (channels) + num_heads: int, number of attention heads + mlp_ratio: float, ratio of mlp hidden dim and input embedding dim, default: 4. + sr_ratio: the spatial reduction ratio of SRA (linear spatial reduction attention) + qkv_bias: bool, if True, enable learnable bias to q,k,v, default: True + qk_scale: float, override default qk scale head_dim**-0.5 if set, default: None + dropout: float, dropout for output, default: 0. + attention_dropout: float, dropout of attention, default: 0. + drop_path: float, drop path rate, default: 0. + """ + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, dropout=0., + attention_dropout=0., drop_path=0., sr_ratio=1, linear=False): + super(PvTv2Block, self).__init__() + self.norm1 = nn.LayerNorm(dim, epsilon=1e-6) + self.attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attention_dropout=attention_dropout, + dropout=dropout, + sr_ratio=sr_ratio, + linear=linear) + + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + self.norm2 = nn.LayerNorm(dim, epsilon=1e-6) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim*mlp_ratio), + dropout=dropout, + linear=linear) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + return weight_attr, bias_attr + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + + return x + + +class PyramidVisionTransformerV2(nn.Layer): + """PyramidVisionTransformerV2 class + + Attributes: + patch_size: int, size of patch + image_size: int, size of image + num_classes: int, num of image classes + in_channels: int, channel of input image + num_heads: int, num of heads in attention module + num_stages: int, num of stages contains OverlapPatch embedding and PvTv2 blocks + depths: list of int, num of PvTv2 blocks in each stage + mlp_ratio: float, hidden dimension of mlp layer is mlp_ratio * mlp input dim + sr_ratio: the spatial reduction ratio of SRA (linear spatial reduction attention) + qkv_bias: bool, if True, set qkv layers have bias enabled + qk_scale: float, scale factor for qk. + embed_dims: list of int, output dimension of patch embedding + dropout: float, dropout rate for linear layer + attention_dropout: float, dropout rate for attention + drop_path: float, drop path rate, default: 0. 
+ linear: bool, if True, use linear spatial reduction attention instead of spatial reduction attention + patch_embedding: PatchEmbedding, patch embedding instance + norm: nn.LayerNorm, norm layer applied after transformer + fc: nn.Linear, classifier op. + """ + + def __init__(self, + image_size=224, + patch_size=4, + embed_dims=[32, 64, 160, 256], + num_classes=1000, + in_channels=3, + num_heads=[1, 2, 5, 8], + depths=[2, 2, 2, 2], + mlp_ratio=[8, 8, 4, 4], + sr_ratio=[8, 4, 2, 1], + qkv_bias=True, + qk_scale=None, + dropout=0., + attention_dropout=0., + drop_path=0., + linear=False): + super(PyramidVisionTransformerV2, self).__init__() + + self.patch_size = patch_size + self.image_size = image_size + self.num_classes = num_classes + self.in_channels = in_channels + self.num_heads = num_heads + self.depths = depths + self.num_stages = len(self.depths) + self.mlp_ratio = mlp_ratio + self.sr_ratio = sr_ratio + self.qkv_bias = qkv_bias + self.qk_scale = qk_scale + self.embed_dims = embed_dims + self.dropout = dropout + self.attention_dropout = attention_dropout + self.drop_path = drop_path + self.linear = linear + + depth_decay = [x.item() for x in paddle.linspace(0, self.drop_path, sum(self.depths))] + cur = 0 + + for i in range(self.num_stages): + patch_embedding = OverlapPatchEmbedding(image_size=self.image_size if i == 0 else self.image_size // (2 ** (i + 1)), + patch_size=7 if i == 0 else 3, + stride=4 if i == 0 else 2, + in_channels=self.in_channels if i == 0 else self.embed_dims[i - 1], + embed_dim=self.embed_dims[i]) + + block = nn.LayerList([copy.deepcopy(PvTv2Block( + dim=self.embed_dims[i], num_heads=self.num_heads[i], mlp_ratio=self.mlp_ratio[i], qkv_bias=self.qkv_bias, + qk_scale=self.qk_scale, dropout=self.dropout, attention_dropout=self.attention_dropout, + drop_path=depth_decay[cur + j], sr_ratio=self.sr_ratio[i], linear=self.linear)) + for j in range(self.depths[i])]) + norm = nn.LayerNorm(self.embed_dims[i], epsilon=1e-6) + cur += self.depths[i] + + setattr(self, f"patch_embedding{i + 1}", patch_embedding) + setattr(self, f"block{i + 1}", block) + setattr(self, f"norm{i + 1}", norm) + + # classification head + self.head = nn.Linear(self.embed_dims[3], self.num_classes) if self.num_classes > 0 else Identity() + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + return weight_attr, bias_attr + + def freeze_patch_embedding(self): + self.patch_embedding1.requires_grad = False + + def forward_features(self, x): + B = x.shape[0] + + for i in range(self.num_stages): + patch_embedding = getattr(self, f"patch_embedding{i + 1}") + block = getattr(self, f"block{i + 1}") + norm = getattr(self, f"norm{i + 1}") + x, H, W = patch_embedding(x) + + for idx, blk in enumerate(block): + x = blk(x, H, W) + x = norm(x) + if i != self.num_stages - 1: + x = x.reshape([B, H, W, -1]).transpose([0, 3, 1, 2]) + + return x.mean(axis=1) + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + + return x + + +def build_pvtv2(config): + model = PyramidVisionTransformerV2( + image_size=config.DATA.IMAGE_SIZE, + patch_size=config.MODEL.TRANS.PATCH_SIZE, + embed_dims=config.MODEL.TRANS.EMBED_DIMS, + num_classes=config.MODEL.NUM_CLASSES, + in_channels=config.MODEL.TRANS.IN_CHANNELS, + num_heads=config.MODEL.TRANS.NUM_HEADS, + depths=config.MODEL.TRANS.STAGE_DEPTHS, + mlp_ratio=config.MODEL.TRANS.MLP_RATIO, + sr_ratio=config.MODEL.TRANS.SR_RATIO, + 
qkv_bias=config.MODEL.TRANS.QKV_BIAS, + qk_scale=config.MODEL.TRANS.QK_SCALE, + dropout=config.MODEL.DROPOUT, + attention_dropout=config.MODEL.ATTENTION_DROPOUT, + drop_path=config.MODEL.DROP_PATH, + linear=config.MODEL.TRANS.LINEAR) + return model diff --git a/image_classification/PVTv2/pvtv2_torch.py b/image_classification/PVTv2/pvtv2_torch.py new file mode 100644 index 00000000..fa115c09 --- /dev/null +++ b/image_classification/PVTv2/pvtv2_torch.py @@ -0,0 +1,401 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from functools import partial + +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ +from timm.models.registry import register_model +from timm.models.vision_transformer import _cfg +import math + + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., linear=False): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + self.linear = linear + if self.linear: + self.relu = nn.ReLU(inplace=True) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + x = self.fc1(x) + if self.linear: + x = self.relu(x) + x = self.dwconv(x, H, W) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1, linear=False): + super().__init__() + assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." 
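+        # Spatial-reduction attention (SRA): when sr_ratio > 1, keys/values are
+        # computed from a copy of x downsampled by a strided conv (self.sr), which
+        # shrinks the KV sequence length by a factor of sr_ratio**2. With
+        # linear=True, an AdaptiveAvgPool2d(7) reduces KV to a fixed 7x7 grid
+        # instead (the "linear SRA" variant of PVTv2).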
+ + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.q = nn.Linear(dim, dim, bias=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.linear = linear + self.sr_ratio = sr_ratio + if not linear: + if sr_ratio > 1: + self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = nn.LayerNorm(dim) + else: + self.pool = nn.AdaptiveAvgPool2d(7) + self.sr = nn.Conv2d(dim, dim, kernel_size=1, stride=1) + self.norm = nn.LayerNorm(dim) + self.act = nn.GELU() + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + B, N, C = x.shape + q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + + if not self.linear: + if self.sr_ratio > 1: + x_ = x.permute(0, 2, 1).reshape(B, C, H, W) + x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1) + x_ = self.norm(x_) + kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + else: + kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + else: + x_ = x.permute(0, 2, 1).reshape(B, C, H, W) + x_ = self.sr(self.pool(x_)).reshape(B, C, -1).permute(0, 2, 1) + x_ = self.norm(x_) + x_ = self.act(x_) + kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + k, v = kv[0], kv[1] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + + + return x + + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1, linear=False): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio, linear=linear) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, linear=linear) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + + + return x + + +class OverlapPatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + self.img_size = img_size + self.patch_size = patch_size + self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1] + self.num_patches = self.H * self.W + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride, + padding=(patch_size[0] // 2, patch_size[1] // 2)) + self.norm = nn.LayerNorm(embed_dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x): + x = self.proj(x) + _, _, H, W = x.shape + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + + return x, H, W + + +class PyramidVisionTransformerV2(nn.Module): + def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0., + attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm, + depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], num_stages=4, linear=False): + super().__init__() + self.num_classes = num_classes + self.depths = depths + self.num_stages = num_stages + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + cur = 0 + + for i in range(num_stages): + patch_embed = OverlapPatchEmbed(img_size=img_size if i == 0 else img_size // (2 ** (i + 1)), + patch_size=7 if i == 0 else 3, + stride=4 if i == 0 else 2, + in_chans=in_chans if i == 0 else embed_dims[i - 1], + embed_dim=embed_dims[i]) + + block = nn.ModuleList([Block( + dim=embed_dims[i], num_heads=num_heads[i], mlp_ratio=mlp_ratios[i], qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + j], norm_layer=norm_layer, + sr_ratio=sr_ratios[i], linear=linear) + for j in range(depths[i])]) + norm = norm_layer(embed_dims[i]) + cur += depths[i] + + setattr(self, f"patch_embed{i + 1}", patch_embed) + setattr(self, f"block{i + 1}", block) + setattr(self, 
f"norm{i + 1}", norm) + + # classification head + self.head = nn.Linear(embed_dims[3], num_classes) if num_classes > 0 else nn.Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def freeze_patch_emb(self): + self.patch_embed1.requires_grad = False + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'} # has pos_embed may be better + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + B = x.shape[0] + + for i in range(self.num_stages): + patch_embed = getattr(self, f"patch_embed{i + 1}") + block = getattr(self, f"block{i + 1}") + norm = getattr(self, f"norm{i + 1}") + x, H, W = patch_embed(x) + + for idx, blk in enumerate(block): + x = blk(x, H, W) + x = norm(x) + if i != self.num_stages - 1: + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + + return x.mean(dim=1) + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + + return x + + +class DWConv(nn.Module): + def __init__(self, dim=768): + super(DWConv, self).__init__() + self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) + + def forward(self, x, H, W): + B, N, C = x.shape + x = x.transpose(1, 2).view(B, C, H, W) + x = self.dwconv(x) + x = x.flatten(2).transpose(1, 2) + + return x + + +def _conv_filter(state_dict, patch_size=16): + """ convert patch embedding weight from manual patchify + linear proj to conv""" + out_dict = {} + for k, v in state_dict.items(): + if 'patch_embed.proj.weight' in k: + v = v.reshape((v.shape[0], 3, patch_size, patch_size)) + out_dict[k] = v + + return out_dict + + +@register_model +def pvt_v2_b0(pretrained=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, embed_dims=[32, 64, 160, 256], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4], qkv_bias=True, + norm_layer=nn.LayerNorm, depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1], + **kwargs) + model.default_cfg = _cfg() + + return model + + +@register_model +def pvt_v2_b1(pretrained=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4], qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1], + **kwargs) + model.default_cfg = _cfg() + + return model + + +@register_model +def pvt_v2_b2(pretrained=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4], qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], **kwargs) + model.default_cfg = _cfg() + + return model + + +@register_model +def pvt_v2_b3(pretrained=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 
8], mlp_ratios=[8, 8, 4, 4], qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 18, 3], sr_ratios=[8, 4, 2, 1], + **kwargs) + model.default_cfg = _cfg() + + return model + + +@register_model +def pvt_v2_b4(pretrained=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4], qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 8, 27, 3], sr_ratios=[8, 4, 2, 1], + **kwargs) + model.default_cfg = _cfg() + + return model + + +@register_model +def pvt_v2_b5(pretrained=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 6, 40, 3], sr_ratios=[8, 4, 2, 1], + **kwargs) + model.default_cfg = _cfg() + + return model + + +@register_model +def pvt_v2_b2_li(pretrained=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4], qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], linear=True, **kwargs) + model.default_cfg = _cfg() + + return model diff --git a/image_classification/PVTv2/run_eval.sh b/image_classification/PVTv2/run_eval.sh new file mode 100644 index 00000000..cb1d1a10 --- /dev/null +++ b/image_classification/PVTv2/run_eval.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ +-cfg='./configs/pvtv2_tiny_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=4 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./pvtv2_tiny_224' diff --git a/image_classification/PVTv2/run_eval_multi.sh b/image_classification/PVTv2/run_eval_multi.sh new file mode 100644 index 00000000..6a7b43db --- /dev/null +++ b/image_classification/PVTv2/run_eval_multi.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/pvtv2_b0.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./pvtv2_b0' \ +-ngpus=4 diff --git a/image_classification/PVTv2/run_eval_multi_b0.sh b/image_classification/PVTv2/run_eval_multi_b0.sh new file mode 100644 index 00000000..7f296760 --- /dev/null +++ b/image_classification/PVTv2/run_eval_multi_b0.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/pvtv2_b0.yaml' \ +-dataset='imagenet2012' \ +-batch_size=64 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./pvtv2_b0' \ +-ngpus=4 diff --git a/image_classification/PVTv2/run_eval_multi_b1.sh b/image_classification/PVTv2/run_eval_multi_b1.sh new file mode 100644 index 00000000..0b96ff55 --- /dev/null +++ b/image_classification/PVTv2/run_eval_multi_b1.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/pvtv2_b1.yaml' \ +-dataset='imagenet2012' \ +-batch_size=64 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./pvtv2_b1' \ +-ngpus=4 diff --git a/image_classification/PVTv2/run_eval_multi_b2.sh b/image_classification/PVTv2/run_eval_multi_b2.sh new file mode 100644 index 00000000..4d96eaee --- /dev/null +++ b/image_classification/PVTv2/run_eval_multi_b2.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/pvtv2_b2.yaml' \ +-dataset='imagenet2012' \ +-batch_size=64 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./pvtv2_b2' \ 
+-ngpus=4 diff --git a/image_classification/PVTv2/run_eval_multi_b2_linear.sh b/image_classification/PVTv2/run_eval_multi_b2_linear.sh new file mode 100644 index 00000000..a95122ea --- /dev/null +++ b/image_classification/PVTv2/run_eval_multi_b2_linear.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/pvtv2_b2_linear.yaml' \ +-dataset='imagenet2012' \ +-batch_size=64 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./pvtv2_b2_linear' \ +-ngpus=4 diff --git a/image_classification/PVTv2/run_eval_multi_b3.sh b/image_classification/PVTv2/run_eval_multi_b3.sh new file mode 100644 index 00000000..2d1db65c --- /dev/null +++ b/image_classification/PVTv2/run_eval_multi_b3.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/pvtv2_b3.yaml' \ +-dataset='imagenet2012' \ +-batch_size=64 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./pvtv2_b3' \ +-ngpus=4 diff --git a/image_classification/PVTv2/run_eval_multi_b4.sh b/image_classification/PVTv2/run_eval_multi_b4.sh new file mode 100644 index 00000000..79556b46 --- /dev/null +++ b/image_classification/PVTv2/run_eval_multi_b4.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/pvtv2_b4.yaml' \ +-dataset='imagenet2012' \ +-batch_size=64 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./pvtv2_b4' \ +-ngpus=4 diff --git a/image_classification/PVTv2/run_eval_multi_b5.sh b/image_classification/PVTv2/run_eval_multi_b5.sh new file mode 100644 index 00000000..b2adc5a1 --- /dev/null +++ b/image_classification/PVTv2/run_eval_multi_b5.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/pvtv2_b5.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./pvtv2_b5' \ +-ngpus=4 diff --git a/image_classification/PVTv2/run_train.sh b/image_classification/PVTv2/run_train.sh new file mode 100644 index 00000000..c9616488 --- /dev/null +++ b/image_classification/PVTv2/run_train.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ +-cfg='./configs/pvtv2_b0.yaml' \ +-dataset='imagenet2012' \ +-batch_size=16 \ +-data_path='/dataset/imagenet' \ diff --git a/image_classification/PVTv2/run_train_multi.sh b/image_classification/PVTv2/run_train_multi.sh new file mode 100644 index 00000000..e780da73 --- /dev/null +++ b/image_classification/PVTv2/run_train_multi.sh @@ -0,0 +1,7 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/pvtv2_b0.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-ngpus=8 diff --git a/image_classification/PVTv2/tests/__init__.py b/image_classification/PVTv2/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/image_classification/PVTv2/tests/test_transformer.py b/image_classification/PVTv2/tests/test_transformer.py new file mode 100644 index 00000000..afe9189b --- /dev/null +++ b/image_classification/PVTv2/tests/test_transformer.py @@ -0,0 +1,92 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +sys.path.append("..") + +from config import * +from pvtv2 import * +import unittest +import numpy as np +import paddle +import paddle.nn as nn +import sys + + + +class PVTv2Test(unittest.TestCase): + @classmethod + def setUpClass(cls): + paddle.set_device('cpu') + cls.config = get_config() + cls.dummy_img = np.random.randn(4, 3, 224, 224).astype('float32') + cls.dummy_tensor = paddle.to_tensor(cls.dummy_img) + cls.pvtv2 = build_pvtv2(cls.config) + + @classmethod + def tearDown(cls): + pass + + # @unittest.skip('skip for debug') + def test_out_shape(self): + logits = PVTv2Test.pvtv2(PVTv2Test.dummy_tensor) + self.assertEqual(logits.shape, [4, 1000]) + + # @unittest.skip('skip for debug') + def test_all_parameters_updated(self): + optim = paddle.optimizer.SGD(parameters=PVTv2Test.pvtv2.parameters(), learning_rate=0.1) + out = PVTv2Test.pvtv2(PVTv2Test.dummy_tensor) + loss = out.mean() + loss.backward() + optim.step() + + for name, param in PVTv2Test.pvtv2.named_parameters(): + if not param.stop_gradient: + self.assertIsNotNone(param.gradient()) + self.assertNotEqual(0, np.sum(param.gradient()**2)) + + @unittest.skip('skip for debug') + def test_embeddings(self): + embed = OverlapPatchEmbedding(PVTv2Test.config) + dummy_img = np.random.randn(4, 3, 224, 224).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + + patch_out = embed.patch_embed(dummy_tensor) + embed_out = embed(dummy_tensor) + self.assertEqual(patch_out.shape, [4, 768, 7, 7]) + self.assertEqual(embed.cls_token.shape, [1, 1, 768]) + self.assertEqual(embed_out.shape, [4, 50, 768]) + + @unittest.skip('skip for debug') + def test_attention(self): + attn_op = Attention(PVTv2Test.config, num_heads=2) + dummy_img = np.random.randn(4, 50, 768).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + + out, attn = attn_op(dummy_tensor) + self.assertEqual(attn.shape, [4, 12, 50, 50]) + self.assertEqual(out.shape, [4, 50, 768]) + + @unittest.skip('skip for debug') + def test_mlp(self): + mlp_op = Mlp(PVTv2Test.config, 768*4) + dummy_img = np.random.randn(4, 50, 768).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + + out = mlp_op(dummy_tensor) + self.assertEqual(out.shape, [4, 50, 768]) + + +if __name__ == '__main__': + # verbosity=*:默认是1;设为0,则不输出每一个用例的执行结果;2-输出详细的执行结果 + unittest.main(verbosity=2) diff --git a/image_classification/PVTv2/utils.py b/image_classification/PVTv2/utils.py new file mode 100644 index 00000000..f83d6994 --- /dev/null +++ b/image_classification/PVTv2/utils.py @@ -0,0 +1,143 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""utils for ViT + +Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training +and WarmupCosineScheduler for training + +""" + +import math +from paddle.optimizer.lr import LRScheduler + + +class MyPrint(): + """" Print tensor and its shape, used for debug """ + def __init__(self): + self.cnt = 0 + def myprint(self, prefix, var, cnt=None, save=None): + """print tensor and its shape, optionly save to npy + Args: + prefix: str, print info in 1st and last lines + var: Tensor, tensor needs to print + cnt: int, if self.cnt is exceed this value, print will stop + save: str, file name (should end with .npy) to save the tensor, if None no save + """ + if cnt is None or self.cnt < cnt: + print(f'------------ {prefix} ---------------') + print(var.shape, var) + print(f'------------ END {prefix} ---------------') + if save is not None: + var = var.numpy() + with open(save,'wb') as ofile: + np.save(ofile, var) + self.cnt += 1 + + +class AverageMeter(): + """ Meter for monitoring losses""" + def __init__(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.reset() + + def reset(self): + """reset all values to zeros""" + self.avg = 0 + self.sum = 0 + self.cnt = 0 + + def update(self, val, n=1): + """update avg by val and n, where val is the avg of n values""" + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + + +def get_exclude_from_weight_decay_fn(exclude_list=[]): + """ Set params with no weight decay during the training + + For certain params, e.g., positional encoding in ViT, weight decay + may not needed during the learning, this method is used to find + these params. + + Args: + exclude_list: a list of params names which need to exclude + from weight decay. + Returns: + exclude_from_weight_decay_fn: a function returns True if param + will be excluded from weight decay + """ + if len(exclude_list) == 0: + exclude_from_weight_decay_fn = None + else: + def exclude_fn(param): + for name in exclude_list: + if param.endswith(name): + return False + return True + exclude_from_weight_decay_fn = exclude_fn + return exclude_from_weight_decay_fn + + +class WarmupCosineScheduler(LRScheduler): + """Warmup Cosine Scheduler + + First apply linear warmup, then apply cosine decay schedule. + Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" + Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining + "total_epochs - warmup_epochs" + + Attributes: + learning_rate: the starting learning rate (without warmup), not used here! 
+ warmup_start_lr: warmup starting learning rate + start_lr: the starting learning rate (without warmup) + end_lr: the ending learning rate after whole loop + warmup_epochs: # of epochs for warmup + total_epochs: # of total epochs (include warmup) + """ + def __init__(self, + learning_rate, + warmup_start_lr, + start_lr, + end_lr, + warmup_epochs, + total_epochs, + cycles=0.5, + last_epoch=-1, + verbose=False): + """init WarmupCosineScheduler """ + self.warmup_epochs = warmup_epochs + self.total_epochs = total_epochs + self.warmup_start_lr = warmup_start_lr + self.start_lr = start_lr + self.end_lr = end_lr + self.cycles = cycles + super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + """ return lr value """ + if self.last_epoch < self.warmup_epochs: + val = (self.start_lr - self.warmup_start_lr) * float( + self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr + return val + + progress = float(self.last_epoch - self.warmup_epochs) / float( + max(1, self.total_epochs - self.warmup_epochs)) + val = max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) + return val diff --git a/image_classification/README.md b/image_classification/README.md new file mode 100644 index 00000000..52fa227a --- /dev/null +++ b/image_classification/README.md @@ -0,0 +1,112 @@ +# PaddleViT-Classification: Visual Transformer and MLP Models for Image Classification +PaddlePaddle training/validation code and pretrained models for **Image Classification**. + +This implementation is part of [PaddleViT](https://github.com/BR-IDL/PaddleViT.git) project. + +## Update +Update (2021-08-25): Init readme uploaded. + +## Quick Start + + The following links are provided for the code and detail usage of each model architecture: +1. **[ViT](./ViT)** +2. **[DeiT](./DeiT)** +3. **[Swin](./SwinTransformer)** +4. **[VOLO](./VOLO)** +5. **[CSwin](./CSwin)** +6. **[CaiT](./CaiT)** +7. **[PVTv2](./PVTv2)** +8. **[Shuffle Transformer](./Shuffle_Transformer)** +9. **[T2T-ViT](./T2T_ViT)** +10. **[MLP-Mixer](./MLP-Mixer)** +11. **[ResMLP](./ResMLP)** +12. **[gMLP](./gMLP)** + + +## Installation +This module is tested on Python3.6+, and PaddlePaddle 2.1.0+. Most dependencies are installed by PaddlePaddle installation. You only need to install the following packages: +```shell +pip install yacs yaml +``` +Then download the github repo: +```shell +git clone https://github.com/BR-IDL/PaddleViT.git +cd PaddleViT/image_classification +``` + +## Basic Usage +### Data Preparation +ImageNet2012 dataset is used in the following folder structure: +``` +│imagenet/ +├──train/ +│ ├── n01440764 +│ │ ├── n01440764_10026.JPEG +│ │ ├── n01440764_10027.JPEG +│ │ ├── ...... +│ ├── ...... +├──val/ +│ ├── n01440764 +│ │ ├── ILSVRC2012_val_00000293.JPEG +│ │ ├── ILSVRC2012_val_00002138.JPEG +│ │ ├── ...... +│ ├── ...... +``` +### Demo Example +To use the model with pretrained weights, go to the specific subfolder, then download the `.pdparam` weight file and change related file paths in the following python scripts. The model config files are located in `./configs/`. 
+ +Assume the downloaded weight file is stored in `./vit_base_patch16_224.pdparams`, to use the `vit_base_patch16_224` model in python: +```python +from config import get_config +from visual_transformer import build_vit as build_model +# config files in ./configs/ +config = get_config('./configs/vit_base_patch16_224.yaml') +# build model +model = build_model(config) +# load pretrained weights, .pdparams is NOT needed +model_state_dict = paddle.load('./vit_base_patch16_224') +model.set_dict(model_state_dict) +``` +> :robot: See the README file in each model folder for detailed usages. + +## Basic Concepts +PaddleViT image classification module is developed in separate folders for each model with similar structure. Each implementation is around 3 type of classes and 2 types of scripts: +1. **Model classes** such as **[transformer.py](./ViT/transformer.py)**, in which the core *transformer model* and related methods are defined. + +2. **Dataset classes** such as **[dataset.py](./ViT/datasets.py)**, in which the dataset, dataloader, data transforms are defined. We provided flexible implementations for you to customize the data loading scheme. Both single GPU and multi-GPU loading are supported. + +3. **Config classes** such as **[config.py](./ViT/config.py)**, in which the model and training/validation configurations are defined. Usually, you don't need to change the items in the configuration, we provide updating configs by python `arguments` or `.yaml` config file. You can see [here](../docs/ppvit-config.md) for details of our configuration design and usage. + +4. **main scripts** such as **[main_single_gpu.py](./ViT/main_single_gpu.py)**, in which the whole training/validation procedures are defined. The major steps of training or validation are provided, such as logging, loading/saving models, finetuning, etc. Multi-GPU is also supported and implemented in separate python script `main_multi_gpu.py`. + +5. **run scripts** such as **[run_eval_base_224.sh](./ViT/run_eval_base_224.sh)**, in which the shell command for running python script with specific configs and arguments are defined. + + +## Model Architectures + +PaddleViT now provides the following **transfomer based models**: +1. **[ViT](./ViT)** (from Google), released with paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929), by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +2. **[DeiT](./DeiT)** (from Facebook and Sorbonne), released with paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877), by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +3. **[Swin Transformer](./SwinTransformer)** (from Microsoft), released with paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030), by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. +4. **[VOLO](./VOLO)** (from Sea AI Lab and NUS), released with paper [VOLO: Vision Outlooker for Visual Recognition](https://arxiv.org/abs/2106.13112), by Li Yuan, Qibin Hou, Zihang Jiang, Jiashi Feng, Shuicheng Yan. +5. 
**[CSwin Transformer](./CSwin)** (from USTC and Microsoft), released with paper [CSWin Transformer: A General Vision Transformer Backbone with Cross-Shaped Windows +](https://arxiv.org/abs/2107.00652), by Xiaoyi Dong, Jianmin Bao, Dongdong Chen, Weiming Zhang, Nenghai Yu, Lu Yuan, Dong Chen, Baining Guo. +6. **[CaiT](./CaiT)** (from Facebook and Sorbonne), released with paper [Going deeper with Image Transformers](https://arxiv.org/abs/2103.17239), by Hugo Touvron, Matthieu Cord, Alexandre Sablayrolles, Gabriel Synnaeve, Hervé Jégou. +7. **[PVTv2](./PVTv2)** (from NJU/HKU/NJUST/IIAI/SenseTime), released with paper [PVTv2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797), by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. +8. **[Shuffle Transformer](./Shuffle_Transformer)** (from Tencent), released with paper [Shuffle Transformer: Rethinking Spatial Shuffle for Vision Transformer](https://arxiv.org/abs/2106.03650), by Zilong Huang, Youcheng Ben, Guozhong Luo, Pei Cheng, Gang Yu, Bin Fu. +9. **[T2T-ViT](./T2T_ViT)** (from NUS and YITU), released with paper [Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet +](https://arxiv.org/abs/2101.11986), by Li Yuan, Yunpeng Chen, Tao Wang, Weihao Yu, Yujun Shi, Zihang Jiang, Francis EH Tay, Jiashi Feng, Shuicheng Yan. + +PaddleViT now provides the following **MLP based models**: +1. **[MLP-Mixer](./MLP-Mixer)** (from Google), released with paper [MLP-Mixer: An all-MLP Architecture for Vision](https://arxiv.org/abs/2105.01601), by Ilya Tolstikhin, Neil Houlsby, Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Thomas Unterthiner, Jessica Yung, Andreas Steiner, Daniel Keysers, Jakob Uszkoreit, Mario Lucic, Alexey Dosovitskiy +2. **[ResMLP](./ResMLP)** (from Facebook/Sorbonne/Inria/Valeo), released with paper [ResMLP: Feedforward networks for image classification with data-efficient training](https://arxiv.org/abs/2105.03404), by Hugo Touvron, Piotr Bojanowski, Mathilde Caron, Matthieu Cord, Alaaeldin El-Nouby, Edouard Grave, Gautier Izacard, Armand Joulin, Gabriel Synnaeve, Jakob Verbeek, Hervé Jégou. +3. **[gMLP](./gMLP)** (from Google), released with paper [Pay Attention to MLPs](https://arxiv.org/abs/2105.08050), by Hanxiao Liu, Zihang Dai, David R. So, Quoc V. Le. + +#### Coming Soon: #### +1. **[CrossViT]()** (from IBM), released with paper [CrossViT: Cross-Attention Multi-Scale Vision Transformer for Image Classification](https://arxiv.org/abs/2103.14899), by Chun-Fu Chen, Quanfu Fan, Rameswar Panda. +2. **[Focal Transformer]()** (from Microsoft), released with paper [Focal Self-attention for Local-Global Interactions in Vision Transformers](https://arxiv.org/abs/2107.00641), by Jianwei Yang, Chunyuan Li, Pengchuan Zhang, Xiyang Dai, Bin Xiao, Lu Yuan and Jianfeng Gao. +3. **[HaloNet]()**, (from Google), released with paper [Scaling Local Self-Attention for Parameter Efficient Visual Backbones](https://arxiv.org/abs/2103.12731), by Ashish Vaswani, Prajit Ramachandran, Aravind Srinivas, Niki Parmar, Blake Hechtman, Jonathon Shlens. + + +## Contact +If you have any questions, please create an [issue](https://github.com/BR-IDL/PaddleViT/issues) on our Github. 
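+
+As a follow-up to the *Demo Example* above, a loaded model can be sanity-checked with a dummy forward pass. This is a minimal sketch; it assumes the same config/weight paths as the Demo Example and a 1000-class ImageNet head, and the random tensor should be replaced with a properly normalized image for real predictions:
+
+```python
+import paddle
+import paddle.nn.functional as F
+from config import get_config
+from visual_transformer import build_vit as build_model
+
+config = get_config('./configs/vit_base_patch16_224.yaml')
+model = build_model(config)
+model.set_dict(paddle.load('./vit_base_patch16_224'))  # .pdparams suffix is NOT needed
+model.eval()
+
+x = paddle.randn([1, 3, 224, 224])  # dummy batch of one 224x224 RGB image
+with paddle.no_grad():
+    logits = model(x)                    # expected shape: [1, 1000]
+    probs = F.softmax(logits, axis=-1)
+print(logits.shape, float(probs.sum()))  # [1, 1000], ~1.0
+```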
diff --git a/image_classification/ResMLP/README.md b/image_classification/ResMLP/README.md new file mode 100644 index 00000000..b52ea5db --- /dev/null +++ b/image_classification/ResMLP/README.md @@ -0,0 +1,170 @@ +# ResMLP: Feedforward networks for image classification with data-efficient training, [arxiv](https://arxiv.org/abs/2105.03404) + +PaddlePaddle training/validation code and pretrained models for **ResMLP**. + +The official and 3rd party pytorch implementation are [here](https://github.com/facebookresearch/deit) and [here](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/mlp_mixer.py). + + +This implementation is developed by [PaddleViT](https://github.com/BR-IDL/PaddleViT.git). + + +

+*Figure: ResMLP Model Overview*

+ + +### Update +Update (2021-08-11): Code is released and ported weights are uploaded. + +## Models Zoo +| Model | Acc@1 | Acc@5 | Image Size | Crop_pct | Interpolation | Link | +|--------------------------------|-------|-------|------------|----------|---------------|--------------| +| resmlp_24_224 | 79.38 | 94.55 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/15A5q1XSXBz-y1AcXhy_XaDymLLj2s2Tn/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1nLAvyG53REdwYNCLmp4yBA)(jdcx) | +| resmlp_36_224 | 79.77 | 94.89 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1WrhVm-7EKnLmPU18Xm0C7uIqrg-RwqZL/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1QD4EWmM9b2u1r8LsnV6rUA)(33w3) | +| resmlp_big_24_224 | 81.04 | 95.02 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1KLlFuzYb17tC5Mmue3dfyr2L_q4xHTZi/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1oXU6CR0z7O0XNwu_UdZv_w)(r9kb) | +| resmlp_big_24_distilled_224 | 83.59 | 96.65 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/199q0MN_BlQh9-HbB28RdxHj1ApMTHow-/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1yUrfbqW8vLODDiRV5WWkhQ)(4jk5) | + +> *The results are evaluated on ImageNet2012 validation set. +> +> Note: ResMLP weights are ported from [timm](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/mlp_mixer.py) + + + +## Notebooks +We provide a few notebooks in aistudio to help you get started: + +**\*(coming soon)\*** + + +## Requirements +- Python>=3.6 +- yaml>=0.2.5 +- [PaddlePaddle](https://www.paddlepaddle.org.cn/documentation/docs/en/install/index_en.html)>=2.1.0 +- [yacs](https://github.com/rbgirshick/yacs)>=0.1.8 + +## Data +ImageNet2012 dataset is used in the following folder structure: +``` +│imagenet/ +├──train/ +│ ├── n01440764 +│ │ ├── n01440764_10026.JPEG +│ │ ├── n01440764_10027.JPEG +│ │ ├── ...... +│ ├── ...... +├──val/ +│ ├── n01440764 +│ │ ├── ILSVRC2012_val_00000293.JPEG +│ │ ├── ILSVRC2012_val_00002138.JPEG +│ │ ├── ...... +│ ├── ...... +``` + +## Usage +To use the model with pretrained weights, download the `.pdparam` weight file and change related file paths in the following python scripts. The model config files are located in `./configs/`. + +For example, assume the downloaded weight file is stored in `./resmlp_24_224.pdparams`, to use the `resmlp_24_224` model in python: +```python +from config import get_config +from resmlp import build_res_mlp as build_model +# config files in ./configs/ +config = get_config('./configs/resmlp_24_224.yaml') +# build model +model = build_model(config) +# load pretrained weights, .pdparams is NOT needed +model_state_dict = paddle.load('./resmlp_24_224') +model.set_dict(model_state_dict) +``` + +## Evaluation +To evaluate ResMLP model performance on ImageNet2012 with a single GPU, run the following script using command line: +```shell +sh run_eval.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/resmlp_24_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./resmlp_24_224' +``` + +
+ + +Run evaluation using multi-GPUs: + + + +```shell +sh run_eval_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/resmlp_24_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./resmlp_24_224' +``` + +
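+
+The `Crop_pct` column in the Models Zoo table corresponds to the validation preprocessing in `datasets.py`: the shorter image side is resized to `floor(IMAGE_SIZE / CROP_PCT)` and the result is center-cropped to `IMAGE_SIZE`, so `resmlp_24_224` with crop_pct 0.875 is resized to 256 and then cropped to 224. A minimal standalone sketch of the same transform (the image path below is only a placeholder):
+
+```python
+import math
+from paddle.vision import transforms, image_load
+
+image_size, crop_pct = 224, 0.875
+scale_size = int(math.floor(image_size / crop_pct))   # 224 / 0.875 = 256
+
+val_transform = transforms.Compose([
+    transforms.Resize(scale_size, 'bicubic'),          # shorter side -> 256
+    transforms.CenterCrop((image_size, image_size)),   # 256 -> 224 x 224
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+])
+
+img = val_transform(image_load('example.JPEG').convert('RGB'))  # placeholder path
+print(img.shape)  # [3, 224, 224]
+```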
+ +## Training +To train the ResMLP Transformer model on ImageNet2012 with single GPUs, run the following script using command line: +```shell +sh run_train.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/resmlp_24_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=32 \ + -data_path='/dataset/imagenet' \ +``` + +
+ + +Run training using multi-GPUs: + + + +```shell +sh run_train_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/resmlp_24_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ +``` + +
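+
+The training commands above delegate the optimizer and learning-rate wiring to `main_single_gpu.py` / `main_multi_gpu.py`, which import `WarmupCosineScheduler` from `utils.py`: the learning rate ramps linearly from `WARMUP_START_LR` to `BASE_LR` over `WARMUP_EPOCHS`, then follows a half-cosine decay down to `END_LR`. The following is only a minimal sketch of that setup using the defaults from `config.py` (the exact wiring lives in the main scripts, and the epoch loop body is elided):
+
+```python
+import paddle
+from config import get_config
+from resmlp import build_res_mlp as build_model
+from utils import WarmupCosineScheduler
+
+config = get_config('./configs/resmlp_24_224.yaml')
+model = build_model(config)
+
+# linear warmup 1e-6 -> 1e-3 over 3 epochs, then cosine decay towards 1e-5
+scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR,
+                                  warmup_start_lr=config.TRAIN.WARMUP_START_LR,
+                                  start_lr=config.TRAIN.BASE_LR,
+                                  end_lr=config.TRAIN.END_LR,
+                                  warmup_epochs=config.TRAIN.WARMUP_EPOCHS,
+                                  total_epochs=config.TRAIN.NUM_EPOCHS)
+optimizer = paddle.optimizer.AdamW(parameters=model.parameters(),
+                                   learning_rate=scheduler,
+                                   weight_decay=config.TRAIN.WEIGHT_DECAY)
+
+for epoch in range(config.TRAIN.NUM_EPOCHS):
+    # ... one training epoch with `optimizer` goes here ...
+    scheduler.step()  # advance the schedule once per epoch
+```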
+ + +## Visualization Attention Map +**(coming soon)** + +## Reference +``` +@article{touvron2021resmlp, + title={Resmlp: Feedforward networks for image classification with data-efficient training}, + author={Touvron, Hugo and Bojanowski, Piotr and Caron, Mathilde and Cord, Matthieu and El-Nouby, Alaaeldin and Grave, Edouard and Joulin, Armand and Synnaeve, Gabriel and Verbeek, Jakob and J{\'e}gou, Herv{\'e}}, + journal={arXiv preprint arXiv:2105.03404}, + year={2021} +} +``` diff --git a/image_classification/ResMLP/config.py b/image_classification/ResMLP/config.py new file mode 100644 index 00000000..3ab6abf0 --- /dev/null +++ b/image_classification/ResMLP/config.py @@ -0,0 +1,143 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration + +Configuration for data, model archtecture, and training, etc. +Config can be set by .yaml file or by argparser(limited usage) + + +""" +import os +from yacs.config import CfgNode as CN +import yaml + +_C = CN() +_C.BASE = [''] + +# data settings +_C.DATA = CN() +_C.DATA.BATCH_SIZE = 256 #256 # train batch_size for single GPU +_C.DATA.BATCH_SIZE_EVAL = 8 #64 # val batch_size for single GPU +_C.DATA.DATA_PATH = '/dataset/imagenet/' # path to dataset +_C.DATA.DATASET = 'imagenet2012' # dataset name +_C.DATA.IMAGE_SIZE = 224 # input image size: 224 for pretrain, 384 for finetune +_C.DATA.CROP_PCT = 1.0 # input image scale ratio, scale is applied before centercrop in eval mode +_C.DATA.NUM_WORKERS = 2 # number of data loading threads + +# model settings +_C.MODEL = CN() +_C.MODEL.TYPE = 'ResMLP' +_C.MODEL.NAME = 'ResMLP' +_C.MODEL.RESUME = None +_C.MODEL.PRETRAINED = None +_C.MODEL.NUM_CLASSES = 1000 +_C.MODEL.DROPOUT = 0.1 +_C.MODEL.DROPPATH = 0.1 + +# transformer settings +_C.MODEL.MIXER = CN() +_C.MODEL.MIXER.PATCH_SIZE = 16 +_C.MODEL.MIXER.HIDDEN_SIZE = 384 +_C.MODEL.MIXER.NUM_LAYERS = 24 + +# training settings +_C.TRAIN = CN() +_C.TRAIN.LAST_EPOCH = 0 +_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 3 #34 # ~ 10k steps for 4096 batch size +_C.TRAIN.WEIGHT_DECAY = 0.01 #0.3 # 0.0 for finetune +_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune +_C.TRAIN.WARMUP_START_LR = 1e-6 #0.0 +_C.TRAIN.END_LR = 1e-5 +_C.TRAIN.GRAD_CLIP = 1.0 +_C.TRAIN.ACCUM_ITER = 2 #1 + +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' +_C.TRAIN.LR_SCHEDULER.MILESTONES = "30, 60, 90" # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 # only used in StepLRScheduler + +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'AdamW' +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for adamW +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# misc +_C.SAVE = "./output" +_C.TAG = "default" +_C.SAVE_FREQ = 20 # freq to save chpt +_C.REPORT_FREQ = 50 # freq to logging info +_C.VALIDATE_FREQ = 20 # freq to do validation +_C.SEED = 0 +_C.EVAL = False # run evaluation only 
+_C.LOCAL_RANK = 0 +_C.NGPUS = 1 + + +def _update_config_from_file(config, cfg_file): + config.defrost() + with open(cfg_file, 'r') as infile: + yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader) + for cfg in yaml_cfg.setdefault('BASE', ['']): + if cfg: + _update_config_from_file( + config, os.path.join(os.path.dirname(cfg_file), cfg) + ) + print('merging config from {}'.format(cfg_file)) + config.merge_from_file(cfg_file) + config.freeze() + +def update_config(config, args): + """Update config by ArgumentParser + Args: + args: ArgumentParser contains options + Return: + config: updated config + """ + if args.cfg: + _update_config_from_file(config, args.cfg) + config.defrost() + if args.dataset: + config.DATA.DATASET = args.dataset + if args.batch_size: + config.DATA.BATCH_SIZE = args.batch_size + if args.image_size: + config.DATA.IMAGE_SIZE = args.image_size + if args.data_path: + config.DATA.DATA_PATH = args.data_path + if args.ngpus: + config.NGPUS = args.ngpus + if args.eval: + config.EVAL = True + config.DATA.BATCH_SIZE_EVAL = args.batch_size + if args.pretrained: + config.MODEL.PRETRAINED = args.pretrained + if args.resume: + config.MODEL.RESUME = args.resume + if args.last_epoch: + config.TRAIN.LAST_EPOCH = args.last_epoch + + #config.freeze() + return config + + +def get_config(cfg_file=None): + """Return a clone of config or load from yaml file""" + config = _C.clone() + if cfg_file: + _update_config_from_file(config, cfg_file) + return config diff --git a/image_classification/ResMLP/configs/resmlp_24_224.yaml b/image_classification/ResMLP/configs/resmlp_24_224.yaml new file mode 100644 index 00000000..3be23d37 --- /dev/null +++ b/image_classification/ResMLP/configs/resmlp_24_224.yaml @@ -0,0 +1,11 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: ResMLP + NAME: resmlp_24_224 + MIXER: + PATCH_SIZE: 16 + HIDDEN_SIZE: 384 + NUM_LAYERS: 24 + diff --git a/image_classification/ResMLP/configs/resmlp_36_224.yaml b/image_classification/ResMLP/configs/resmlp_36_224.yaml new file mode 100644 index 00000000..c27f5041 --- /dev/null +++ b/image_classification/ResMLP/configs/resmlp_36_224.yaml @@ -0,0 +1,11 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: ResMLP + NAME: resmlp_36_224 + MIXER: + PATCH_SIZE: 16 + HIDDEN_SIZE: 384 + NUM_LAYERS: 36 + diff --git a/image_classification/ResMLP/configs/resmlp_big_24_224.yaml b/image_classification/ResMLP/configs/resmlp_big_24_224.yaml new file mode 100644 index 00000000..ce63a9b0 --- /dev/null +++ b/image_classification/ResMLP/configs/resmlp_big_24_224.yaml @@ -0,0 +1,11 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: ResMLP + NAME: resmlp_big_24_224 + MIXER: + PATCH_SIZE: 8 + HIDDEN_SIZE: 768 + NUM_LAYERS: 24 + diff --git a/image_classification/ResMLP/datasets.py b/image_classification/ResMLP/datasets.py new file mode 100644 index 00000000..a52d9fe3 --- /dev/null +++ b/image_classification/ResMLP/datasets.py @@ -0,0 +1,187 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Dataset related classes and methods for ViT training and validation +Cifar10, Cifar100 and ImageNet2012 are supported +""" + +import os +import math +from paddle.io import Dataset, DataLoader, DistributedBatchSampler +from paddle.vision import transforms, datasets, image_load + +class ImageNet2012Dataset(Dataset): + """Build ImageNet2012 dataset + + This class gets train/val imagenet datasets, which loads transfomed data and labels. + + Attributes: + file_folder: path where imagenet images are stored + transform: preprocessing ops to apply on image + img_path_list: list of full path of images in whole dataset + label_list: list of labels of whole dataset + """ + + def __init__(self, file_folder, mode="train", transform=None): + """Init ImageNet2012 Dataset with dataset file path, mode(train/val), and transform""" + super(ImageNet2012Dataset, self).__init__() + assert mode in ["train", "val"] + self.file_folder = file_folder + self.transform = transform + self.img_path_list = [] + self.label_list = [] + + if mode == "train": + self.list_file = os.path.join(self.file_folder, "train_list.txt") + else: + self.list_file = os.path.join(self.file_folder, "val_list.txt") + + with open(self.list_file, 'r') as infile: + for line in infile: + img_path = line.strip().split()[0] + img_label = int(line.strip().split()[1]) + self.img_path_list.append(os.path.join(self.file_folder, img_path)) + self.label_list.append(img_label) + print(f'----- Imagenet2012 image {mode} list len = {len(self.label_list)}') + + def __len__(self): + return len(self.label_list) + + def __getitem__(self, index): + data = image_load(self.img_path_list[index]).convert('RGB') + data = self.transform(data) + label = self.label_list[index] + + return data, label + + +def get_train_transforms(config): + """ Get training transforms + + For training, a RandomResizedCrop is applied, then normalization is applied with + [0.5, 0.5, 0.5] mean and std. The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + transforms_train = transforms.Compose([ + transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), + scale=(0.05, 1.0)), + transforms.ToTensor(), + #transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_train + + +def get_val_transforms(config): + """ Get training transforms + + For validation, image is first Resize then CenterCrop to image_size. + Then normalization is applied with [0.5, 0.5, 0.5] mean and std. + The input pixel values must be rescaled to [0, 1.] 
+ Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) + transforms_val = transforms.Compose([ + transforms.Resize(scale_size, 'bicubic'), # single int for resize shorter side of image + transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), + transforms.ToTensor(), + #transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_val + + +def get_dataset(config, mode='train'): + """ Get dataset from config and mode (train/val) + + Returns the related dataset object according to configs and mode(train/val) + + Args: + config: configs contains dataset related settings. see config.py for details + Returns: + dataset: dataset object + """ + assert mode in ['train', 'val'] + if config.DATA.DATASET == "cifar10": + if mode == 'train': + dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "cifar100": + if mode == 'train': + dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "imagenet2012": + if mode == 'train': + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_train_transforms(config)) + else: + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_val_transforms(config)) + else: + raise NotImplementedError( + "[{config.DATA.DATASET}] Only cifar10, cifar100, imagenet2012 are supported now") + return dataset + + +def get_dataloader(config, dataset, mode='train', multi_process=False): + """Get dataloader with config, dataset, mode as input, allows multiGPU settings. + + Multi-GPU loader is implements as distributedBatchSampler. + + Args: + config: see config.py for details + dataset: paddle.io.dataset object + mode: train/val + multi_process: if True, use DistributedBatchSampler to support multi-processing + Returns: + dataloader: paddle.io.DataLoader object. + """ + + if mode == 'train': + batch_size = config.DATA.BATCH_SIZE + else: + batch_size = config.DATA.BATCH_SIZE_EVAL + + if multi_process is True: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=(mode == 'train')) + dataloader = DataLoader(dataset, + batch_sampler=sampler, + num_workers=config.DATA.NUM_WORKERS) + else: + dataloader = DataLoader(dataset, + batch_size=batch_size, + num_workers=config.DATA.NUM_WORKERS, + shuffle=(mode == 'train')) + return dataloader diff --git a/image_classification/ResMLP/droppath.py b/image_classification/ResMLP/droppath.py new file mode 100644 index 00000000..fcff05e9 --- /dev/null +++ b/image_classification/ResMLP/droppath.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth +""" + +import paddle +import paddle.nn as nn + +def drop_path(inputs, drop_prob=0., training=False): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if drop_prob == 0. or not training: + return inputs + keep_prob = 1 - drop_prob + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide(keep_prob) * random_tensor # divide is to keep same output expectation + return output + + +class DropPath(nn.Layer): + """DropPath class""" + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, inputs): + return drop_path(inputs, self.drop_prob, self.training) diff --git a/image_classification/ResMLP/main_multi_gpu.py b/image_classification/ResMLP/main_multi_gpu.py new file mode 100644 index 00000000..6dd1b915 --- /dev/null +++ b/image_classification/ResMLP/main_multi_gpu.py @@ -0,0 +1,365 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""ResMLP training/validation using multiple GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as dist +from datasets import get_dataloader, get_dataset +from resmlp import build_res_mlp as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('ResMLP') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +arguments = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, arguments) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +#config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + # + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: 
{train_acc_meter.avg:.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + dist.all_reduce(loss) + dist.all_reduce(acc1) + dist.all_reduce(acc5) + loss = loss / dist.get_world_size() + acc1 = acc1 / dist.get_world_size() + acc5 = acc5 / dist.get_world_size() + + batch_size = paddle.to_tensor(image.shape[0]) + dist.all_reduce(batch_size) + + val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) + val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) + val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0]) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main_worker(*args): + # 0. Preparation + dist.init_parallel_env() + last_epoch = config.TRAIN.LAST_EPOCH + world_size = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + seed = config.SEED + local_rank + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # 1. Create model + model = build_model(config) + model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train, dataset_val = args[0], args[1] + dataloader_train = get_dataloader(config, dataset_train, 'train', True) + dataloader_val = get_dataloader(config, dataset_val, 'test', True) + total_batch_train = len(dataloader_train) + total_batch_val = len(dataloader_val) + logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. 
Define optimizer and lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip, + #apply_decay_param_fun=get_exclude_from_weight_decay_fn([ + # 'absolute_pos_embed', 'relative_position_bias_table']), + ) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # 5. Load pretrained model / load resumt model and optimizer states + if config.MODEL.PRETRAINED: + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + + # 6. 
Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + + # 6. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER) + scheduler.step() + + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if local_rank == 0: + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +def main(): + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS + dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + + +if __name__ == "__main__": + main() diff --git a/image_classification/ResMLP/main_single_gpu.py b/image_classification/ResMLP/main_single_gpu.py new file mode 100644 index 00000000..f50ed7b6 --- /dev/null +++ b/image_classification/ResMLP/main_single_gpu.py @@ -0,0 +1,334 @@ + +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
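`main_multi_gpu.py` above launches one `main_worker` per GPU through `dist.spawn` and averages validation metrics across ranks by summing them with `dist.all_reduce` and dividing by the world size. A stripped-down sketch of that pattern (assumes a machine with at least two GPUs; the per-rank accuracy values are made up for illustration):

```python
import paddle
import paddle.distributed as dist

def worker():
    dist.init_parallel_env()
    rank = dist.get_rank()
    # pretend rank 0 measured 0.70 and rank 1 measured 0.80 on its data shard
    acc = paddle.to_tensor([0.70 + 0.10 * rank])
    dist.all_reduce(acc)                   # default reduce op is SUM
    acc = acc / dist.get_world_size()      # global average, 0.75 on every rank
    if rank == 0:
        print(float(acc))

if __name__ == '__main__':
    dist.spawn(worker, nprocs=2)           # one process per GPU, as in main() above
```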
+ +"""ResMLP training/validation using single GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from datasets import get_dataloader +from datasets import get_dataset +from resmlp import build_res_mlp as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('ResMLP') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +args = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, args) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +#config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + + 
train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + batch_size = image.shape[0] + val_loss_meter.update(loss.numpy()[0], batch_size) + val_acc1_meter.update(acc1.numpy()[0], batch_size) + val_acc5_meter.update(acc5.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main(): + # 0. Preparation + last_epoch = config.TRAIN.LAST_EPOCH + seed = config.SEED + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + #paddle.set_device('gpu:0') + # 1. Create model + model = build_model(config) + #model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + dataloader_train = get_dataloader(config, dataset_train, 'train', False) + dataloader_val = get_dataloader(config, dataset_val, 'val', False) + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. Define lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + # 5. 
Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + # 6. Load pretrained model or load resume model and optimizer states + if config.MODEL.PRETRAINED: + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") + # 7. Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + # 8. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. 
LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=len(dataloader_train), + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + ) + scheduler.step() + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +if __name__ == "__main__": + main() diff --git a/image_classification/ResMLP/port_weights/load_pytorch_weights.py b/image_classification/ResMLP/port_weights/load_pytorch_weights.py new file mode 100644 index 00000000..08da2cad --- /dev/null +++ b/image_classification/ResMLP/port_weights/load_pytorch_weights.py @@ -0,0 +1,166 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
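In both training scripts the optimizer only steps every `accum_iter` batches (or on the final batch of the epoch), so the effective batch size is roughly `accum_iter * BATCH_SIZE`; the scripts rely on the default `'mean'` reduction of `nn.CrossEntropyLoss` and therefore skip rescaling the loss. A tiny sketch of which iterations actually trigger `optimizer.step()` (the `accum_iter` and batch count below are made-up values):

```python
accum_iter = 4
num_batches = 10   # stands in for len(dataloader) in the scripts above

step_ids = [batch_id for batch_id in range(num_batches)
            if (batch_id + 1) % accum_iter == 0 or (batch_id + 1) == num_batches]
print(step_ids)    # [3, 7, 9]: two full accumulations of 4 batches, then the 2 leftovers
```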
+ +import os +import argparse +import numpy as np +import paddle +import torch +import timm +from resmlp import build_res_mlp +from config import get_config +from config import update_config + +config = get_config('./configs/resmlp_24_224.yaml') +print(config) + + +def print_model_named_params(model): + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + print('----------------------------------') + + +def print_model_named_buffers(model): + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + print('----------------------------------') + + +def torch_to_paddle_mapping(): + mapping = [ + ('stem.proj', 'patch_embed.patch_embed'), + ('norm.alpha', 'norm.alpha'), + ('norm.beta', 'norm.beta'), + ('head', 'head'), + ] + + num_layers = 24 + for idx in range(num_layers): + pp_prefix = f'mixer_layers.{idx}' + th_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.ls1', f'{pp_prefix}.ls1'), + (f'{th_prefix}.ls2', f'{pp_prefix}.ls2'), + (f'{th_prefix}.norm1.alpha', f'{pp_prefix}.norm1.alpha'), + (f'{th_prefix}.norm1.beta', f'{pp_prefix}.norm1.beta'), + (f'{th_prefix}.norm2.alpha', f'{pp_prefix}.norm2.alpha'), + (f'{th_prefix}.norm2.beta', f'{pp_prefix}.norm2.beta'), + (f'{th_prefix}.linear_tokens', f'{pp_prefix}.linear_tokens'), + (f'{th_prefix}.mlp_channels.fc1', f'{pp_prefix}.mlp_channels.fc1'), + (f'{th_prefix}.mlp_channels.fc2', f'{pp_prefix}.mlp_channels.fc2'), + ] + mapping.extend(layer_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'**SET** {th_name} {th_shape} **TO** {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + + if len(value.shape) == 2 and transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + + + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_res_mlp(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('+++++++++++++++++++++++++++++++++++') + device = torch.device('cpu') + torch_model = timm.create_model('resmlp_24_224', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0:100]) + print(out_paddle[0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./resmlp_24_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + print('all done') + + +if __name__ == "__main__": + main() diff --git a/image_classification/ResMLP/port_weights/load_pytorch_weights_36.py b/image_classification/ResMLP/port_weights/load_pytorch_weights_36.py new file mode 100644 index 00000000..ce026bf4 --- /dev/null +++ b/image_classification/ResMLP/port_weights/load_pytorch_weights_36.py @@ -0,0 +1,166 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
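The `_set_value` helper above transposes every 2-D weight because `torch.nn.Linear` stores its weight as `[out_features, in_features]` while `paddle.nn.Linear` stores `[in_features, out_features]`; 1-D tensors (biases, the Affine `alpha`/`beta`, the `ls1`/`ls2` scales) and the 4-D patch-embedding conv kernel keep their layout and are copied as-is. A small sketch of the layout difference (the layer sizes are just ResMLP-24-like numbers):

```python
import paddle
import torch

torch_fc = torch.nn.Linear(in_features=384, out_features=1536)
paddle_fc = paddle.nn.Linear(in_features=384, out_features=1536)

print(tuple(torch_fc.weight.shape))    # (1536, 384)
print(tuple(paddle_fc.weight.shape))   # (384, 1536)

# so the ported value needs a transpose, exactly what _set_value() does for 2-D weights
paddle_fc.weight.set_value(torch_fc.weight.detach().numpy().transpose((1, 0)))
```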
+ +import os +import argparse +import numpy as np +import paddle +import torch +import timm +from resmlp import build_res_mlp +from config import get_config +from config import update_config + +config = get_config('./configs/resmlp_36_224.yaml') +print(config) + + +def print_model_named_params(model): + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + print('----------------------------------') + + +def print_model_named_buffers(model): + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + print('----------------------------------') + + +def torch_to_paddle_mapping(): + mapping = [ + ('stem.proj', 'patch_embed.patch_embed'), + ('norm.alpha', 'norm.alpha'), + ('norm.beta', 'norm.beta'), + ('head', 'head'), + ] + + num_layers = 36 + for idx in range(num_layers): + pp_prefix = f'mixer_layers.{idx}' + th_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.ls1', f'{pp_prefix}.ls1'), + (f'{th_prefix}.ls2', f'{pp_prefix}.ls2'), + (f'{th_prefix}.norm1.alpha', f'{pp_prefix}.norm1.alpha'), + (f'{th_prefix}.norm1.beta', f'{pp_prefix}.norm1.beta'), + (f'{th_prefix}.norm2.alpha', f'{pp_prefix}.norm2.alpha'), + (f'{th_prefix}.norm2.beta', f'{pp_prefix}.norm2.beta'), + (f'{th_prefix}.linear_tokens', f'{pp_prefix}.linear_tokens'), + (f'{th_prefix}.mlp_channels.fc1', f'{pp_prefix}.mlp_channels.fc1'), + (f'{th_prefix}.mlp_channels.fc2', f'{pp_prefix}.mlp_channels.fc2'), + ] + mapping.extend(layer_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'**SET** {th_name} {th_shape} **TO** {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + + if len(value.shape) == 2 and transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + + + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_res_mlp(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('+++++++++++++++++++++++++++++++++++') + device = torch.device('cpu') + torch_model = timm.create_model('resmlp_36_224', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0:100]) + print(out_paddle[0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./resmlp_36_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + print('all done') + + +if __name__ == "__main__": + main() diff --git a/image_classification/ResMLP/port_weights/load_pytorch_weights_big_24.py b/image_classification/ResMLP/port_weights/load_pytorch_weights_big_24.py new file mode 100644 index 00000000..34135f0e --- /dev/null +++ b/image_classification/ResMLP/port_weights/load_pytorch_weights_big_24.py @@ -0,0 +1,166 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
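These per-variant porting scripts differ only in the config file, the `timm` model name and the number of mapped blocks, so they could plausibly be collapsed into a single parameterized entry point. A rough sketch of that idea (the target table and the `-variant` flag are hypothetical, not taken from the scripts in this PR):

```python
import argparse

# hypothetical mapping: variant -> (config yaml, timm model name, number of blocks)
PORT_TARGETS = {
    'resmlp_24_224':           ('./configs/resmlp_24_224.yaml', 'resmlp_24_224', 24),
    'resmlp_36_224':           ('./configs/resmlp_36_224.yaml', 'resmlp_36_224', 36),
    'resmlp_big_24_224':       ('./configs/resmlp_big_24_224.yaml', 'resmlp_big_24_224', 24),
    'resmlp_big_24_distilled': ('./configs/resmlp_big_24_224.yaml', 'resmlp_big_24_distilled_224', 24),
}

def parse_args():
    parser = argparse.ArgumentParser('port ResMLP weights')
    parser.add_argument('-variant', type=str, default='resmlp_24_224',
                        choices=sorted(PORT_TARGETS))
    return parser.parse_args()

if __name__ == '__main__':
    cfg_path, timm_name, num_blocks = PORT_TARGETS[parse_args().variant]
    print(cfg_path, timm_name, num_blocks)
```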
+ +import os +import argparse +import numpy as np +import paddle +import torch +import timm +from resmlp import build_res_mlp +from config import get_config +from config import update_config + +config = get_config('./configs/resmlp_big_24_224.yaml') +print(config) + + +def print_model_named_params(model): + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + print('----------------------------------') + + +def print_model_named_buffers(model): + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + print('----------------------------------') + + +def torch_to_paddle_mapping(): + mapping = [ + ('stem.proj', 'patch_embed.patch_embed'), + ('norm.alpha', 'norm.alpha'), + ('norm.beta', 'norm.beta'), + ('head', 'head'), + ] + + num_layers = 24 + for idx in range(num_layers): + pp_prefix = f'mixer_layers.{idx}' + th_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.ls1', f'{pp_prefix}.ls1'), + (f'{th_prefix}.ls2', f'{pp_prefix}.ls2'), + (f'{th_prefix}.norm1.alpha', f'{pp_prefix}.norm1.alpha'), + (f'{th_prefix}.norm1.beta', f'{pp_prefix}.norm1.beta'), + (f'{th_prefix}.norm2.alpha', f'{pp_prefix}.norm2.alpha'), + (f'{th_prefix}.norm2.beta', f'{pp_prefix}.norm2.beta'), + (f'{th_prefix}.linear_tokens', f'{pp_prefix}.linear_tokens'), + (f'{th_prefix}.mlp_channels.fc1', f'{pp_prefix}.mlp_channels.fc1'), + (f'{th_prefix}.mlp_channels.fc2', f'{pp_prefix}.mlp_channels.fc2'), + ] + mapping.extend(layer_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'**SET** {th_name} {th_shape} **TO** {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + + if len(value.shape) == 2 and transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + + + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_res_mlp(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('+++++++++++++++++++++++++++++++++++') + device = torch.device('cpu') + torch_model = timm.create_model('resmlp_big_24_224', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0:100]) + print(out_paddle[0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./resmlp_big_24_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + print('all done') + + +if __name__ == "__main__": + main() diff --git a/image_classification/ResMLP/port_weights/load_pytorch_weights_big_24_distilled.py b/image_classification/ResMLP/port_weights/load_pytorch_weights_big_24_distilled.py new file mode 100644 index 00000000..b834dd1e --- /dev/null +++ b/image_classification/ResMLP/port_weights/load_pytorch_weights_big_24_distilled.py @@ -0,0 +1,166 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
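Each porting script accepts the conversion only when the PyTorch and Paddle logits agree within an absolute tolerance of roughly `1e-2`. A tiny illustration of what that `np.allclose` check accepts and rejects (the numbers are made up):

```python
import numpy as np

out_torch = np.array([0.1234, -1.5000, 2.0000])
out_paddle = np.array([0.1239, -1.4920, 2.0090])        # off by at most ~9e-3

print(np.allclose(out_torch, out_paddle, atol=1e-2))    # True  -> conversion accepted
print(np.allclose(out_torch, out_paddle, atol=1e-4))    # False -> would be rejected
```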
+ +import os +import argparse +import numpy as np +import paddle +import torch +import timm +from resmlp import build_res_mlp +from config import get_config +from config import update_config + +config = get_config('./configs/resmlp_big_24_224.yaml') +print(config) + + +def print_model_named_params(model): + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + print('----------------------------------') + + +def print_model_named_buffers(model): + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + print('----------------------------------') + + +def torch_to_paddle_mapping(): + mapping = [ + ('stem.proj', 'patch_embed.patch_embed'), + ('norm.alpha', 'norm.alpha'), + ('norm.beta', 'norm.beta'), + ('head', 'head'), + ] + + num_layers = 24 + for idx in range(num_layers): + pp_prefix = f'mixer_layers.{idx}' + th_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.ls1', f'{pp_prefix}.ls1'), + (f'{th_prefix}.ls2', f'{pp_prefix}.ls2'), + (f'{th_prefix}.norm1.alpha', f'{pp_prefix}.norm1.alpha'), + (f'{th_prefix}.norm1.beta', f'{pp_prefix}.norm1.beta'), + (f'{th_prefix}.norm2.alpha', f'{pp_prefix}.norm2.alpha'), + (f'{th_prefix}.norm2.beta', f'{pp_prefix}.norm2.beta'), + (f'{th_prefix}.linear_tokens', f'{pp_prefix}.linear_tokens'), + (f'{th_prefix}.mlp_channels.fc1', f'{pp_prefix}.mlp_channels.fc1'), + (f'{th_prefix}.mlp_channels.fc2', f'{pp_prefix}.mlp_channels.fc2'), + ] + mapping.extend(layer_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'**SET** {th_name} {th_shape} **TO** {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + + if len(value.shape) == 2 and transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + + + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_res_mlp(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('+++++++++++++++++++++++++++++++++++') + device = torch.device('cpu') + torch_model = timm.create_model('resmlp_big_24_distilled_224', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0:100]) + print(out_paddle[0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./resmlp_big_24_distilled_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + print('all done') + + +if __name__ == "__main__": + main() diff --git a/image_classification/ResMLP/resmlp.png b/image_classification/ResMLP/resmlp.png new file mode 100644 index 00000000..d794a531 Binary files /dev/null and b/image_classification/ResMLP/resmlp.png differ diff --git a/image_classification/ResMLP/resmlp.py b/image_classification/ResMLP/resmlp.py new file mode 100644 index 00000000..2f83ea9a --- /dev/null +++ b/image_classification/ResMLP/resmlp.py @@ -0,0 +1,213 @@ +import math +import copy +import paddle +import paddle.nn as nn +from droppath import DropPath + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + def forward(self, x): + return x + + +class PatchEmbedding(nn.Layer): + """Patch Embeddings + + Apply patch embeddings on input images. Embeddings is implemented using a Conv2D op. 
+ + Attributes: + image_size: int, input image size, default: 224 + patch_size: int, size of patch, default: 4 + in_channels: int, input image channels, default: 3 + embed_dim: int, embedding dimension, default: 96 + """ + + def __init__(self, image_size=224, patch_size=4, in_channels=3, embed_dim=96, norm_layer=None): + super(PatchEmbedding, self).__init__() + image_size = (image_size, image_size) + patch_size = (patch_size, patch_size) + patches_resolution = [image_size[0]//patch_size[0], image_size[1]//patch_size[1]] + self.image_size = image_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + self.in_channels = in_channels + self.embed_dim = embed_dim + self.patch_embed = nn.Conv2D(in_channels=in_channels, + out_channels=embed_dim, + kernel_size=patch_size, + stride=patch_size) + self.norm = norm_layer if norm_layer is not None else Identity() + + def forward(self, x): + x = self.patch_embed(x) # [batch, embed_dim, h, w] h,w = patch_resolution + x = x.flatten(start_axis=2, stop_axis=-1) # [batch, embed_dim, h*w] h*w = num_patches + x = x.transpose([0, 2, 1]) # [batch, h*w, embed_dim] + x = self.norm(x) # [batch, num_patches, embed_dim] + return x + + +class Mlp(nn.Layer): + """ MLP module + + Impl using nn.Linear and activation is GELU, dropout is applied. + Ops: fc -> act -> dropout -> fc -> dropout + + Attributes: + fc1: nn.Linear + fc2: nn.Linear + act: GELU + dropout1: dropout after fc1 + dropout2: dropout after fc2 + """ + + def __init__(self, in_features, hidden_features, dropout): + super(Mlp, self).__init__() + w_attr_1, b_attr_1 = self._init_weights() + self.fc1 = nn.Linear(in_features, + hidden_features, + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + w_attr_2, b_attr_2 = self._init_weights() + self.fc2 = nn.Linear(hidden_features, + in_features, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + self.act = nn.GELU() + self.dropout = nn.Dropout(dropout) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierUniform()) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Normal(std=1e-6)) + return weight_attr, bias_attr + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.dropout(x) + x = self.fc2(x) + x = self.dropout(x) + return x + + +class ResBlock(nn.Layer): + def __init__(self, dim, seq_len, mlp_ratio=4, init_values=1e-5, dropout=0., droppath=0.): + super(ResBlock, self).__init__() + channels_dim = int(mlp_ratio * dim) + self.norm1 = Affine(dim) + self.linear_tokens = nn.Linear(seq_len, seq_len) + self.drop_path = DropPath(droppath) + self.norm2 = Affine(dim) + self.mlp_channels = Mlp(dim, channels_dim, dropout=dropout) + + self.ls1 = paddle.create_parameter( + shape=[dim], + dtype='float32', + default_initializer=nn.initializer.Constant(init_values)) + + self.ls2 = paddle.create_parameter( + shape=[dim], + dtype='float32', + default_initializer=nn.initializer.Constant(init_values)) + + def forward(self, x): + h = x + x = self.norm1(x) + x = x.transpose([0, 2, 1]) + x = self.linear_tokens(x) + x = x.transpose([0, 2, 1]) + x = self.ls1 * x + x = self.drop_path(x) + x = x + h + + h = x + x = self.norm2(x) + x = self.mlp_channels(x) + x = self.ls2 * x + x = self.drop_path(x) + x = x + h + + return x + + +class Affine(nn.Layer): + def __init__(self, dim): + super(Affine, self).__init__() + self.alpha = paddle.create_parameter( + shape=[1, 1, dim], + dtype='float32', + 
default_initializer=nn.initializer.Constant(1)) + + self.beta = paddle.create_parameter( + shape=[1, 1, dim], + dtype='float32', + default_initializer=nn.initializer.Constant(0)) + + def forward(self, x): + x = paddle.multiply(self.alpha, x) + x = self.beta + x + return x + + +class ResMlp(nn.Layer): + def __init__(self, + num_classes=1000, + image_size=224, + in_channels=3, + patch_size=16, + num_mixer_layers=24, + embed_dim=384, + mlp_ratio=4, + dropout=0., + droppath=0., + patch_embed_norm=False): + super(ResMlp, self).__init__() + self.num_classes = num_classes + self.num_features = embed_dim + self.embed_dim = embed_dim + + norm_layer=nn.LayerNorm(embed_dim, epsilon=1e-6) + self.patch_embed = PatchEmbedding( + image_size=image_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim, + norm_layer=norm_layer if patch_embed_norm else None) + + self.mixer_layers = nn.Sequential( + *[ResBlock(embed_dim, + self.patch_embed.num_patches, + mlp_ratio, + dropout, + droppath) for _ in range(num_mixer_layers)]) + + self.norm = Affine(embed_dim) + self.head = nn.Linear(embed_dim, self.num_classes) + + def forward_features(self, x): + x = self.patch_embed(x) + x = self.mixer_layers(x) + x = self.norm(x) + x = x.mean(axis=1) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +def build_res_mlp(config): + model = ResMlp(num_classes=config.MODEL.NUM_CLASSES, + image_size=config.DATA.IMAGE_SIZE, + patch_size=config.MODEL.MIXER.PATCH_SIZE, + in_channels=3, + num_mixer_layers=config.MODEL.MIXER.NUM_LAYERS, + embed_dim=config.MODEL.MIXER.HIDDEN_SIZE, + mlp_ratio=4, + dropout=config.MODEL.DROPOUT, + droppath=config.MODEL.DROPPATH) + return model diff --git a/image_classification/ResMLP/run_eval.sh b/image_classification/ResMLP/run_eval.sh new file mode 100644 index 00000000..5be25f33 --- /dev/null +++ b/image_classification/ResMLP/run_eval.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=7 \ +python main_single_gpu.py \ +-cfg='./configs/resmlp_24_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./resmlp_24_224' diff --git a/image_classification/ResMLP/run_eval_36.sh b/image_classification/ResMLP/run_eval_36.sh new file mode 100644 index 00000000..2412726f --- /dev/null +++ b/image_classification/ResMLP/run_eval_36.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ +-cfg='./configs/resmlp_36_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./resmlp_36_224' \ diff --git a/image_classification/ResMLP/run_eval_multi.sh b/image_classification/ResMLP/run_eval_multi.sh new file mode 100644 index 00000000..fdefa3e0 --- /dev/null +++ b/image_classification/ResMLP/run_eval_multi.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/resmlp_24_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./resmlp_24_224' \ +-ngpus=4 diff --git a/image_classification/ResMLP/run_eval_multi_36.sh b/image_classification/ResMLP/run_eval_multi_36.sh new file mode 100644 index 00000000..bd0fb93b --- /dev/null +++ b/image_classification/ResMLP/run_eval_multi_36.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1 \ +python main_multi_gpu.py \ +-cfg='./configs/resmlp_36_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./resmlp_36_224' \ diff --git 
a/image_classification/ResMLP/run_eval_multi_big_24.sh b/image_classification/ResMLP/run_eval_multi_big_24.sh new file mode 100644 index 00000000..2bd1ad9f --- /dev/null +++ b/image_classification/ResMLP/run_eval_multi_big_24.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/resmlp_big_24_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./resmlp_big_24_224' \ +-ngpus=4 diff --git a/image_classification/ResMLP/run_eval_multi_big_24_distilled.sh b/image_classification/ResMLP/run_eval_multi_big_24_distilled.sh new file mode 100644 index 00000000..1e634604 --- /dev/null +++ b/image_classification/ResMLP/run_eval_multi_big_24_distilled.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/resmlp_big_24_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./resmlp_big_24_distilled_224' \ +-ngpus=4 diff --git a/image_classification/ResMLP/run_train.sh b/image_classification/ResMLP/run_train.sh new file mode 100644 index 00000000..8ac87545 --- /dev/null +++ b/image_classification/ResMLP/run_train.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=7 \ +python main_single_gpu.py \ +-cfg='./configs/resmlp_24_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ diff --git a/image_classification/ResMLP/run_train_multi.sh b/image_classification/ResMLP/run_train_multi.sh new file mode 100644 index 00000000..21b8f546 --- /dev/null +++ b/image_classification/ResMLP/run_train_multi.sh @@ -0,0 +1,7 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/resmlp_24_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-ngpus=4 diff --git a/image_classification/ResMLP/tests/__init__.py b/image_classification/ResMLP/tests/__init__.py new file mode 100644 index 00000000..84952a81 --- /dev/null +++ b/image_classification/ResMLP/tests/__init__.py @@ -0,0 +1 @@ +# init \ No newline at end of file diff --git a/image_classification/ResMLP/tests/test_resmlp.py b/image_classification/ResMLP/tests/test_resmlp.py new file mode 100644 index 00000000..341b7360 --- /dev/null +++ b/image_classification/ResMLP/tests/test_resmlp.py @@ -0,0 +1,109 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
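The `ResMlp` model above turns a 224x224 image into 196 tokens of width `embed_dim` via the conv patch embedding, passes them through a stack of `ResBlock`s that mix tokens and then channels (each branch wrapped by an `Affine` layer and scaled by the learned `ls1`/`ls2` vectors) without changing the token grid, and finally feeds the token mean to the linear head. A short shape-flow sketch, assuming `resmlp.py` above is on the import path and using ResMLP-24-style sizes:

```python
import paddle
from resmlp import PatchEmbedding, ResBlock   # classes defined in resmlp.py above

x = paddle.randn([2, 3, 224, 224])
patch_embed = PatchEmbedding(image_size=224, patch_size=16, in_channels=3, embed_dim=384)
tokens = patch_embed(x)
print(tokens.shape)                 # [2, 196, 384], 196 = (224 // 16) ** 2 patches

block = ResBlock(dim=384, seq_len=196)
out = block(tokens)
print(out.shape)                    # [2, 196, 384], blocks preserve the token grid
print(out.mean(axis=1).shape)       # [2, 384], what the classification head consumes
```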
+ +import unittest +import numpy as np +import paddle +import paddle.nn as nn +from config import * +from resmlp import Identity +from resmlp import PatchEmbedding +from resmlp import ResMlp +from resmlp import Mlp +from resmlp import ResBlock +from resmlp import Affine +from resmlp import build_res_mlp + + +class MlpTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + paddle.set_device('cpu') + cls.config = get_config() + cls.dummy_img = np.random.randn(4, 3, 224, 224).astype('float32') + cls.dummy_tensor = paddle.to_tensor(cls.dummy_img) + cls.model = build_res_mlp(cls.config) + + @classmethod + def tearDown(cls): + pass + + #@unittest.skip('skip for debug') + def test_out_shape(self): + out = MlpTest.model(MlpTest.dummy_tensor) + self.assertEqual(out.shape, [4, 1000]) + + #@unittest.skip('skip for debug') + def test_all_parameters_updated(self): + optim = paddle.optimizer.SGD( + parameters=MlpTest.model.parameters(), learning_rate=0.1) + out = MlpTest.model(MlpTest.dummy_tensor) + loss = out.mean() + loss.backward() + optim.step() + + for name, param in MlpTest.model.named_parameters(): + if not param.stop_gradient: + self.assertIsNotNone(param.gradient()) + self.assertNotEqual(0, np.sum(param.gradient()**2)) + + #@unittest.skip('skip for debug') + def test_embeddings(self): + embed = PatchEmbedding(embed_dim=768) + dummy_img = np.random.randn(4, 3, 224, 224).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + + embed_out = embed(dummy_tensor) + self.assertEqual(embed_out.shape, [4, 3136, 768]) + + #@unittest.skip('skip for debug') + def test_mlp(self): + mlp_op = Mlp(768, 256, 0.0) + dummy_img = np.random.randn(4, 50, 768).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + + out = mlp_op(dummy_tensor) + self.assertEqual(out.shape, [4, 50, 768]) + + #@unittest.skip('skip for debug') + def test_identity(self): + op = Identity() + dummy_img = np.random.randn(4, 50, 768).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + + out = op(dummy_tensor) + self.assertEqual(out.shape, [4, 50, 768]) + + #@unittest.skip('skip for debug') + def test_mixer_block(self): + op = ResBlock(dim=768, seq_len=50) + dummy_img = np.random.randn(4, 50, 768).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + + out = op(dummy_tensor) + self.assertEqual(out.shape, [4, 50, 768]) + + def test_affine(self): + op = Affine(dim=768) + dummy_tensor = paddle.ones([4, 50, 768]) + + dummy_alpha = paddle.ones([1, 1, 768]) * 0.5 + dummy_beta = paddle.ones([1, 1, 768]) * 0.2 + + op.alpha.set_value(dummy_alpha) + op.beta.set_value(dummy_beta) + + out = op(dummy_tensor) + self.assertEqual(out.shape, [4, 50, 768]) + self.assertEqual(paddle.equal_all(out, paddle.ones([4, 50, 768]) * 0.7).numpy(), True) diff --git a/image_classification/ResMLP/utils.py b/image_classification/ResMLP/utils.py new file mode 100644 index 00000000..44800527 --- /dev/null +++ b/image_classification/ResMLP/utils.py @@ -0,0 +1,120 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""utils for ViT + +Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training +and WarmupCosineScheduler for training + +""" + +import math +from paddle.optimizer.lr import LRScheduler + + +class AverageMeter(): + """ Meter for monitoring losses""" + def __init__(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.reset() + + def reset(self): + """reset all values to zeros""" + self.avg = 0 + self.sum = 0 + self.cnt = 0 + + def update(self, val, n=1): + """update avg by val and n, where val is the avg of n values""" + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + + +def get_exclude_from_weight_decay_fn(exclude_list=[]): + """ Set params with no weight decay during the training + + For certain params, e.g., positional encoding in ViT, weight decay + may not needed during the learning, this method is used to find + these params. + + Args: + exclude_list: a list of params names which need to exclude + from weight decay. + Returns: + exclude_from_weight_decay_fn: a function returns True if param + will be excluded from weight decay + """ + if len(exclude_list) == 0: + exclude_from_weight_decay_fn = None + else: + def exclude_fn(param): + for name in exclude_list: + if param.endswith(name): + return False + return True + exclude_from_weight_decay_fn = exclude_fn + return exclude_from_weight_decay_fn + + +class WarmupCosineScheduler(LRScheduler): + """Warmup Cosine Scheduler + + First apply linear warmup, then apply cosine decay schedule. + Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" + Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining + "total_epochs - warmup_epochs" + + Attributes: + learning_rate: the starting learning rate (without warmup), not used here! + warmup_start_lr: warmup starting learning rate + start_lr: the starting learning rate (without warmup) + end_lr: the ending learning rate after whole loop + warmup_epochs: # of epochs for warmup + total_epochs: # of total epochs (include warmup) + """ + def __init__(self, + learning_rate, + warmup_start_lr, + start_lr, + end_lr, + warmup_epochs, + total_epochs, + cycles=0.5, + last_epoch=-1, + verbose=False): + """init WarmupCosineScheduler """ + self.warmup_epochs = warmup_epochs + self.total_epochs = total_epochs + self.warmup_start_lr = warmup_start_lr + self.start_lr = start_lr + self.end_lr = end_lr + self.cycles = cycles + super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + """ return lr value """ + if self.last_epoch < self.warmup_epochs: + val = (self.start_lr - self.warmup_start_lr) * float( + self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr + return val + + progress = float(self.last_epoch - self.warmup_epochs) / float( + max(1, self.total_epochs - self.warmup_epochs)) + val = max(0.0, 0.5 * (1. 
+ math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) + return val diff --git a/image_classification/Shuffle_Transformer/.config.py.swp b/image_classification/Shuffle_Transformer/.config.py.swp new file mode 100644 index 00000000..e144ebf1 Binary files /dev/null and b/image_classification/Shuffle_Transformer/.config.py.swp differ diff --git a/image_classification/Shuffle_Transformer/README.md b/image_classification/Shuffle_Transformer/README.md new file mode 100644 index 00000000..302b7483 --- /dev/null +++ b/image_classification/Shuffle_Transformer/README.md @@ -0,0 +1,166 @@ +# Shuffle Transformer: Rethinking Spatial Shuffle for Vision Transformer, [arxiv](https://arxiv.org/abs/2106.03650) + +PaddlePaddle training/validation code and pretrained models for **Shuffle Transformer**. + +The official pytorch implementation is [here](https://github.com/mulinmeng/Shuffle-Transformer). + +This implementation is developed by [PaddleViT](https://github.com/BR-IDL/PaddleViT.git). + + +

+**Shuffle Transformer Model Overview** (figure)

+ +### Update +Update (2021-08-11): Code is released and ported weights are uploaded. + +## Models Zoo +| Model | Acc@1 | Acc@5 | Image Size | Crop_pct | Interpolation | Link | +|--------------------------------|-------|-------|------------|----------|---------------|--------------| +| shuffle_vit_tiny_patch4_window7| 82.39 | 96.05 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1ffJ-tG_CGVXztPEPQMaT_lUoc4hxFy__/view?usp=sharing)/[baidu](https://pan.baidu.com/s/19DhlLIFyPGOWtyq_c83ZGQ)(8a1i) | +| shuffle_vit_small_patch4_window7| 83.53 | 96.57 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1du9H0SKr0QH9GQjhWDOXOnhpSVpfbb8X/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1rM2J8BVwxQ3kRZoHngwNZA)(xwh3) | +| shuffle_vit_base_patch4_window7| 83.95 | 96.91 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1sYh808AyTG3-_qv6nfN6gCmyagsNAE6q/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1fks_IYDdnXdAkCFuYHW_Nw)(1gsr) | + +> *The results are evaluated on ImageNet2012 validation set. + +## Notebooks +We provide a few notebooks in aistudio to help you get started: + +**\*(coming soon)\*** + + +## Requirements +- Python>=3.6 +- yaml>=0.2.5 +- [PaddlePaddle](https://www.paddlepaddle.org.cn/documentation/docs/en/install/index_en.html)>=2.1.0 +- [yacs](https://github.com/rbgirshick/yacs)>=0.1.8 + +## Data +ImageNet2012 dataset is used in the following folder structure: +``` +│imagenet/ +├──train/ +│ ├── n01440764 +│ │ ├── n01440764_10026.JPEG +│ │ ├── n01440764_10027.JPEG +│ │ ├── ...... +│ ├── ...... +├──val/ +│ ├── n01440764 +│ │ ├── ILSVRC2012_val_00000293.JPEG +│ │ ├── ILSVRC2012_val_00002138.JPEG +│ │ ├── ...... +│ ├── ...... +``` + +## Usage +To use the model with pretrained weights, download the `.pdparam` weight file and change related file paths in the following python scripts. The model config files are located in `./configs/`. + +For example, assume the downloaded weight file is stored in `./shuffle_vit_base_patch4_window7.pdparams`, to use the `shuffle_vit_base_patch4_window7_224` model in python: +```python +from config import get_config +from shuffle_transformer import build_shuffle_transformer as build_model +# config files in ./configs/ +config = get_config('./configs/shuffle_vit_base_patch4_window7_224.yaml') +# build model +model = build_model(config) +# load pretrained weights, .pdparams is NOT needed +model_state_dict = paddle.load('./shuffle_vit_base_patch4_window7_224') +model.set_dict(model_state_dict) +``` + +## Evaluation +To evaluate Shuffle Transformer model performance on ImageNet2012 with a single GPU, run the following script using command line: +```shell +sh run_eval.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/shuffle_vit_base_patch4_window7_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./shuffle_vit_base_patch4_window7_224' +``` + +
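Note that the ImageNet2012 loader in `datasets.py` reads `train_list.txt` and `val_list.txt` from the `-data_path` folder, one `relative/image/path label` pair per line. An illustrative sketch of `val_list.txt` entries (the integer is the class id; the actual values depend on your label mapping):

```
val/n01440764/ILSVRC2012_val_00000293.JPEG 0
val/n01440764/ILSVRC2012_val_00002138.JPEG 0
```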
+ + +Run evaluation using multi-GPUs: + + + +```shell +sh run_eval_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/shuffle_vit_base_patch4_window7_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./shuffle_vit_base_patch4_window7_224' +``` + +
+ + +## Training +To train the Shuffle Transformer model on ImageNet2012 with single GPU, run the following script using command line: +```shell +sh run_train.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/shuffle_vit_base_patch4_window7_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=32 \ + -data_path='/dataset/imagenet' \ +``` + + +
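The training scripts also accept `-resume` and `-last_epoch` for continuing an interrupted run. This is a sketch only, with a hypothetical checkpoint prefix; the script appends `.pdparams` and `.pdopt` to the prefix when loading the model and optimizer states:

```shell
CUDA_VISIBLE_DEVICES=0 \
python main_single_gpu.py \
    -cfg='./configs/shuffle_vit_base_patch4_window7_224.yaml' \
    -dataset='imagenet2012' \
    -batch_size=32 \
    -data_path='/dataset/imagenet' \
    -resume='./output/train-<timestamp>/<model>-Epoch-20-Loss-<loss>' \
    -last_epoch=20
```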
+ + +Run training using multi-GPUs: + + + +```shell +sh run_train_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/shuffle_vit_base_patch4_window7_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=32 \ + -data_path='/dataset/imagenet' \ +``` + +
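For single-image inference with the pretrained weights, a minimal sketch that reuses the validation transforms from `datasets.py` (the image path `demo.jpg` is a placeholder, and the weight file is assumed to be downloaded as described in the Usage section):

```python
import paddle
from paddle.vision import image_load
from config import get_config
from datasets import get_val_transforms
from shuffle_transformer import build_shuffle_transformer as build_model

config = get_config('./configs/shuffle_vit_base_patch4_window7_224.yaml')
model = build_model(config)
model.set_dict(paddle.load('./shuffle_vit_base_patch4_window7_224.pdparams'))
model.eval()

img = image_load('demo.jpg').convert('RGB')        # placeholder input image
x = get_val_transforms(config)(img).unsqueeze(0)   # [1, 3, 224, 224]
with paddle.no_grad():
    logits = model(x)                              # [1, 1000]
print('predicted class id:', paddle.argmax(logits, axis=-1).numpy()[0])
```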
+ + +## Visualization Attention Map +**(coming soon)** + +## Reference +``` +@article{huang2021shuffle, + title={Shuffle Transformer: Rethinking Spatial Shuffle for Vision Transformer}, + author={Huang, Zilong and Ben, Youcheng and Luo, Guozhong and Cheng, Pei and Yu, Gang and Fu, Bin}, + journal={arXiv preprint arXiv:2106.03650}, + year={2021} +} +``` diff --git a/image_classification/Shuffle_Transformer/config.py b/image_classification/Shuffle_Transformer/config.py new file mode 100644 index 00000000..ab6f07bf --- /dev/null +++ b/image_classification/Shuffle_Transformer/config.py @@ -0,0 +1,166 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration + +Configuration for data, model archtecture, and training, etc. +Config can be set by .yaml file or by argparser(limited usage) + + +""" +import os +from yacs.config import CfgNode as CN +import yaml + +_C = CN() +_C.BASE = [''] + +# data settings +_C.DATA = CN() +_C.DATA.BATCH_SIZE = 8 #1024 batch_size for single GPU +_C.DATA.BATCH_SIZE_EVAL = 8 #1024 batch_size for single GPU +_C.DATA.DATA_PATH = '/dataset/imagenet/' # path to dataset +_C.DATA.DATASET = 'imagenet2012' # dataset name +_C.DATA.IMAGE_SIZE = 224 # input image size +_C.DATA.CROP_PCT = 0.9 # input image scale ratio, scale is applied before centercrop in eval mode +_C.DATA.NUM_WORKERS = 2 # number of data loading threads + +# model settings +_C.MODEL = CN() +_C.MODEL.TYPE = 'ShuffleTransformer' +_C.MODEL.NAME = 'ShuffleTransformer' +_C.MODEL.RESUME = None +_C.MODEL.PRETRAINED = None +_C.MODEL.NUM_CLASSES = 1000 +_C.MODEL.DROPOUT = 0.0 +_C.MODEL.ATTENTION_DROPOUT = 0.0 +_C.MODEL.DROP_PATH = 0.1 + +# transformer settings +_C.MODEL.TRANS = CN() +_C.MODEL.TRANS.PATCH_SIZE = 4 # image_size = patch_size x window_size x num_windows +_C.MODEL.TRANS.WINDOW_SIZE = 7 +_C.MODEL.TRANS.IN_CHANNELS = 3 +_C.MODEL.TRANS.EMBED_DIM = 96 # same as HIDDEN_SIZE in ViT +_C.MODEL.TRANS.DEPTHS = [2, 2, 6, 2] +_C.MODEL.TRANS.NUM_HEADS = [3, 6, 12, 24] +_C.MODEL.TRANS.MLP_RATIO = 4. 
+_C.MODEL.TRANS.QKV_BIAS = True +_C.MODEL.TRANS.QK_SCALE = None +_C.MODEL.TRANS.APE = False # absolute positional embeddings +_C.MODEL.TRANS.PATCH_NORM = True + +# training settings +_C.TRAIN = CN() +_C.TRAIN.LAST_EPOCH = 0 +_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 20 +_C.TRAIN.WEIGHT_DECAY = 0.05 +_C.TRAIN.BASE_LR = 0.001 +_C.TRAIN.WARMUP_START_LR = 0.0 +_C.TRAIN.END_LR = 0.0 +_C.TRAIN.GRAD_CLIP = 1.0 +_C.TRAIN.ACCUM_ITER = 2 + +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' +_C.TRAIN.LR_SCHEDULER.MILESTONES = "30, 60, 90" # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 # only used in StepLRScheduler + +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'SGD' +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# augmentation +_C.AUG = CN() +_C.AUG.COLOR_JITTER = 0.4 # color jitter factor +_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' +_C.AUG.RE_PROB = 0.25 # random earse prob +_C.AUG.RE_MODE = 'pixel' # random earse mode +_C.AUG.RE_COUNT = 1 # random earse count +_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 +_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 +_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha +_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled +_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled +_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' + +# misc +_C.SAVE = "./output" +_C.TAG = "default" +_C.SAVE_FREQ = 20 # freq to save chpt +_C.REPORT_FREQ = 50 # freq to logging info +_C.VALIDATE_FREQ = 20 # freq to do validation +_C.SEED = 0 +_C.EVAL = False # run evaluation only +_C.LOCAL_RANK = 0 +_C.NGPUS = -1 + + +def _update_config_from_file(config, cfg_file): + config.defrost() + with open(cfg_file, 'r') as infile: + yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader) + for cfg in yaml_cfg.setdefault('BASE', ['']): + if cfg: + _update_config_from_file( + config, os.path.join(os.path.dirname(cfg_file), cfg) + ) + print('merging config from {}'.format(cfg_file)) + config.merge_from_file(cfg_file) + config.freeze() + +def update_config(config, args): + """Update config by ArgumentParser + Args: + args: ArgumentParser contains options + Return: + config: updated config + """ + if args.cfg: + _update_config_from_file(config, args.cfg) + config.defrost() + if args.dataset: + config.DATA.DATASET = args.dataset + if args.batch_size: + config.DATA.BATCH_SIZE = args.batch_size + if args.image_size: + config.DATA.IMAGE_SIZE = args.image_size + if args.data_path: + config.DATA.DATA_PATH = args.data_path + if args.ngpus: + config.NGPUS = args.ngpus + if args.eval: + config.EVAL = True + config.DATA.BATCH_SIZE_EVAL = args.batch_size + if args.pretrained: + config.MODEL.PRETRAINED = args.pretrained + if args.resume: + config.MODEL.RESUME = args.resume + if args.last_epoch: + config.TRAIN.LAST_EPOCH = args.last_epoch + + #config.freeze() + return config + + +def get_config(cfg_file=None): + """Return a clone of config or load from yaml file""" + config = _C.clone() + if cfg_file: + _update_config_from_file(config, cfg_file) + return config diff --git a/image_classification/Shuffle_Transformer/configs/shuffle_vit_base_patch4_window7_224.yaml b/image_classification/Shuffle_Transformer/configs/shuffle_vit_base_patch4_window7_224.yaml new file mode 100644 index 
00000000..1e9a8c33 --- /dev/null +++ b/image_classification/Shuffle_Transformer/configs/shuffle_vit_base_patch4_window7_224.yaml @@ -0,0 +1,12 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: shuffle + NAME: shuffle_vit_tiny_patch4_window7_224 + DROP_PATH: 0.5 + TRANS: + EMBED_DIM: 128 + DEPTHS: [2, 2, 18, 2] + NUM_HEADS: [4, 8, 16, 32] + WINDOW_SIZE: 7 diff --git a/image_classification/Shuffle_Transformer/configs/shuffle_vit_small_patch4_window7_224.yaml b/image_classification/Shuffle_Transformer/configs/shuffle_vit_small_patch4_window7_224.yaml new file mode 100644 index 00000000..4d8f8959 --- /dev/null +++ b/image_classification/Shuffle_Transformer/configs/shuffle_vit_small_patch4_window7_224.yaml @@ -0,0 +1,12 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: shuffle + NAME: shuffle_vit_tiny_patch4_window7_224 + DROP_PATH: 0.3 + TRANS: + EMBED_DIM: 96 + DEPTHS: [2, 2, 18, 2] + NUM_HEADS: [3, 6, 12, 24] + WINDOW_SIZE: 7 diff --git a/image_classification/Shuffle_Transformer/configs/shuffle_vit_tiny_patch4_window7_224.yaml b/image_classification/Shuffle_Transformer/configs/shuffle_vit_tiny_patch4_window7_224.yaml new file mode 100644 index 00000000..e6315280 --- /dev/null +++ b/image_classification/Shuffle_Transformer/configs/shuffle_vit_tiny_patch4_window7_224.yaml @@ -0,0 +1,12 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: shuffle + NAME: shuffle_vit_tiny_patch4_window7_224 + DROP_PATH: 0.1 + TRANS: + EMBED_DIM: 96 + DEPTHS: [2, 2, 6, 2] + NUM_HEADS: [3, 6, 12, 24] + WINDOW_SIZE: 7 diff --git a/image_classification/Shuffle_Transformer/datasets.py b/image_classification/Shuffle_Transformer/datasets.py new file mode 100644 index 00000000..78a3db09 --- /dev/null +++ b/image_classification/Shuffle_Transformer/datasets.py @@ -0,0 +1,188 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Dataset related classes and methods for ViT training and validation +Cifar10, Cifar100 and ImageNet2012 are supported +""" + +import os +import math +from paddle.io import Dataset, DataLoader, DistributedBatchSampler +from paddle.vision import transforms, datasets, image_load + +class ImageNet2012Dataset(Dataset): + """Build ImageNet2012 dataset + + This class gets train/val imagenet datasets, which loads transfomed data and labels. 
+ + Attributes: + file_folder: path where imagenet images are stored + transform: preprocessing ops to apply on image + img_path_list: list of full path of images in whole dataset + label_list: list of labels of whole dataset + """ + + def __init__(self, file_folder, mode="train", transform=None): + """Init ImageNet2012 Dataset with dataset file path, mode(train/val), and transform""" + super(ImageNet2012Dataset, self).__init__() + assert mode in ["train", "val"] + self.file_folder = file_folder + self.transform = transform + self.img_path_list = [] + self.label_list = [] + + if mode == "train": + self.list_file = os.path.join(self.file_folder, "train_list.txt") + else: + self.list_file = os.path.join(self.file_folder, "val_list.txt") + + with open(self.list_file, 'r') as infile: + for line in infile: + img_path = line.strip().split()[0] + img_label = int(line.strip().split()[1]) + self.img_path_list.append(os.path.join(self.file_folder, img_path)) + self.label_list.append(img_label) + print(f'----- Imagenet2012 image {mode} list len = {len(self.label_list)}') + + def __len__(self): + return len(self.label_list) + + def __getitem__(self, index): + data = image_load(self.img_path_list[index]).convert('RGB') + data = self.transform(data) + label = self.label_list[index] + + return data, label + + +def get_train_transforms(config): + """ Get training transforms + + For training, a RandomResizedCrop is applied, then normalization is applied with + [0.5, 0.5, 0.5] mean and std. The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + transforms_train = transforms.Compose([ + transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), + scale=(0.05, 1.0)), + transforms.ToTensor(), + #transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_train + + +def get_val_transforms(config): + """ Get training transforms + + For validation, image is first Resize then CenterCrop to image_size. + Then normalization is applied with [0.5, 0.5, 0.5] mean and std. + The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) + transforms_val = transforms.Compose([ + transforms.Resize(scale_size, 'bicubic'), + transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), + transforms.ToTensor(), + #transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_val + + +def get_dataset(config, mode='train'): + """ Get dataset from config and mode (train/val) + + Returns the related dataset object according to configs and mode(train/val) + + Args: + config: configs contains dataset related settings. 
see config.py for details + Returns: + dataset: dataset object + """ + + assert mode in ['train', 'val'] + if config.DATA.DATASET == "cifar10": + if mode == 'train': + dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "cifar100": + if mode == 'train': + dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "imagenet2012": + if mode == 'train': + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_train_transforms(config)) + else: + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_val_transforms(config)) + else: + raise NotImplementedError( + "[{config.DATA.DATASET}] Only cifar10, cifar100, imagenet2012 are supported now") + return dataset + + +def get_dataloader(config, dataset, mode='train', multi_process=False): + """Get dataloader with config, dataset, mode as input, allows multiGPU settings. + + Multi-GPU loader is implements as distributedBatchSampler. + + Args: + config: see config.py for details + dataset: paddle.io.dataset object + mode: train/val + multi_process: if True, use DistributedBatchSampler to support multi-processing + Returns: + dataloader: paddle.io.DataLoader object. + """ + + if mode == 'train': + batch_size = config.DATA.BATCH_SIZE + else: + batch_size = config.DATA.BATCH_SIZE_EVAL + + if multi_process is True: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=(mode == 'train')) + dataloader = DataLoader(dataset, + batch_sampler=sampler, + num_workers=config.DATA.NUM_WORKERS) + else: + dataloader = DataLoader(dataset, + batch_size=batch_size, + num_workers=config.DATA.NUM_WORKERS, + shuffle=(mode == 'train')) + return dataloader diff --git a/image_classification/Shuffle_Transformer/droppath.py b/image_classification/Shuffle_Transformer/droppath.py new file mode 100644 index 00000000..d7ecf00c --- /dev/null +++ b/image_classification/Shuffle_Transformer/droppath.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth +""" + +import numpy as np +import paddle +import paddle.nn as nn + + +class DropPath(nn.Layer): + """DropPath class""" + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def drop_path(self, inputs): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if self.drop_prob == 0. 
or not self.training: + return inputs + keep_prob = 1 - self.drop_prob + keep_prob = paddle.to_tensor(keep_prob, dtype='float32') + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide(keep_prob) * random_tensor # divide is to keep same output expectation + return output + + def forward(self, inputs): + return self.drop_path(inputs) + + +#def main(): +# tmp = paddle.to_tensor(np.random.rand(8, 16, 8, 8), dtype='float32') +# dp = DropPath(0.5) +# out = dp(tmp) +# print(out) +# +#if __name__ == "__main__": +# main() diff --git a/image_classification/Shuffle_Transformer/main_multi_gpu.py b/image_classification/Shuffle_Transformer/main_multi_gpu.py new file mode 100644 index 00000000..4dbe0ccb --- /dev/null +++ b/image_classification/Shuffle_Transformer/main_multi_gpu.py @@ -0,0 +1,362 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Shuffle Transformer training/validation using multiple GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as dist +from datasets import get_dataloader, get_dataset +from shuffle_transformer import build_shuffle_transformer as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('Shuffle Transformer') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +arguments = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, arguments) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) 
+logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + # + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + dist.all_reduce(loss) + dist.all_reduce(acc1) + dist.all_reduce(acc5) + loss = loss / dist.get_world_size() + acc1 = acc1 / dist.get_world_size() + acc5 = acc5 / dist.get_world_size() + + batch_size = paddle.to_tensor(image.shape[0]) + dist.all_reduce(batch_size) + + val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) + val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) + val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0]) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main_worker(*args): + # 0. 
Preparation + dist.init_parallel_env() + last_epoch = config.TRAIN.LAST_EPOCH + world_size = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + seed = config.SEED + local_rank + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # 1. Create model + model = build_model(config) + model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train, dataset_val = args[0], args[1] + dataloader_train = get_dataloader(config, dataset_train, 'train', True) + dataloader_val = get_dataloader(config, dataset_val, 'test', True) + total_batch_train = len(dataloader_train) + total_batch_val = len(dataloader_val) + logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. Define optimizer and lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip, + #apply_decay_param_fun=get_exclude_from_weight_decay_fn(['pos_embed', 'cls_token']), + ) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # 5. 
Load pretrained model / load resumt model and optimizer states + if config.MODEL.PRETRAINED: + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + + # 6. Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + + # 6. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER) + scheduler.step() + + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if local_rank == 0: + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +def main(): + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS + dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + + +if __name__ == "__main__": + main() diff --git a/image_classification/Shuffle_Transformer/main_single_gpu.py b/image_classification/Shuffle_Transformer/main_single_gpu.py new file mode 100644 index 
00000000..bc77ef27 --- /dev/null +++ b/image_classification/Shuffle_Transformer/main_single_gpu.py @@ -0,0 +1,333 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Shuffle Transformer training/validation using single GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from datasets import get_dataloader +from datasets import get_dataset +from shuffle_transformer import build_shuffle_transformer as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('Shuffle Transformer') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +args = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, args) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +#config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed 
depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + batch_size = image.shape[0] + val_loss_meter.update(loss.numpy()[0], batch_size) + val_acc1_meter.update(acc1.numpy()[0], batch_size) + val_acc5_meter.update(acc5.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main(): + # 0. Preparation + last_epoch = config.TRAIN.LAST_EPOCH + seed = config.SEED + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + #paddle.set_device('gpu:0') + # 1. Create model + model = build_model(config) + #model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + dataloader_train = get_dataloader(config, dataset_train, 'train', False) + dataloader_val = get_dataloader(config, dataset_val, 'val', False) + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. 
Define lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + # 5. Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + # 6. Load pretrained model or load resume model and optimizer states + if config.MODEL.PRETRAINED: + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") + # 7. Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + # 8. 
Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=len(dataloader_train), + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + ) + scheduler.step() + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +if __name__ == "__main__": + main() diff --git a/image_classification/Shuffle_Transformer/port_weights/load_pytorch_weights_base.py b/image_classification/Shuffle_Transformer/port_weights/load_pytorch_weights_base.py new file mode 100644 index 00000000..0356f9ff --- /dev/null +++ b/image_classification/Shuffle_Transformer/port_weights/load_pytorch_weights_base.py @@ -0,0 +1,211 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
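The weight-porting scripts below rely on one layout convention worth calling out: `torch.nn.Linear` stores its weight as `[out_features, in_features]`, whereas `paddle.nn.Linear` stores it as `[in_features, out_features]`, so 2-D weights are transposed during conversion, while convolution kernels and the relative-position tensors are copied unchanged. A small illustrative sketch of that convention (shapes only, not tied to any specific layer):

```python
import numpy as np

# torch.nn.Linear(768, 1000) keeps its weight with shape [out_features, in_features]
torch_linear_weight = np.random.randn(1000, 768).astype('float32')

# paddle.nn.Linear(768, 1000) expects its weight with shape [in_features, out_features]
paddle_linear_weight = torch_linear_weight.transpose((1, 0))
assert paddle_linear_weight.shape == (768, 1000)
```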
+ +import argparse +import numpy as np +import paddle +import torch +from shuffle_transformer import * +from shuffle_pth.shuffle_transformer_torch import ShuffleTransformer as ShuffleTransformerTorch +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/shuffle_vit_base_patch4_window7_224.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + + +def torch_to_paddle_mapping(): + # (torch_param_name, paddle_param_name) + mapping = [ + ('to_token.conv1.0', 'patch_embedding.conv1.0'), # conv + ('to_token.conv1.1', 'patch_embedding.conv1.1'), # bn + ('to_token.conv2.0', 'patch_embedding.conv2.0'), # conv + ('to_token.conv2.1', 'patch_embedding.conv2.1'), # bn + ('to_token.conv3', 'patch_embedding.conv3'), # conv + ] + + for stage_idx, num_layers in enumerate(config.MODEL.TRANS.DEPTHS): + for idx in range(num_layers): + th_layer_idx_0 = idx // 2 + th_layer_idx_1 = idx % 2 + th_prefix = f'stage{stage_idx+1}.layers.{th_layer_idx_0}.{th_layer_idx_1}' + pp_prefix = f'stages.{stage_idx}.layers.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), #bn + (f'{th_prefix}.attn.relative_position_bias_table', f'{pp_prefix}.attn.relative_position_bias_table'), # no transpose + (f'{th_prefix}.attn.relative_position_index', f'{pp_prefix}.attn.relative_position_index'), # no transpose + (f'{th_prefix}.attn.to_qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.local', f'{pp_prefix}.local'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), #bn + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + (f'{th_prefix}.norm3', f'{pp_prefix}.norm3'), #bn + ] + mapping.extend(layer_mapping) + + if stage_idx > 0: + layer_mapping = [ + (f'stage{stage_idx+1}.patch_partition.norm', f'stages.{stage_idx}.patch_partition.norm'), #bn + (f'stage{stage_idx+1}.patch_partition.reduction', f'stages.{stage_idx}.patch_partition.reduction'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + if len(value.shape) == 2 and transpose: + value = value.transpose((1, 0)) + 
pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if 'relative_position' in th_name: + _set_value(th_name, pd_name, transpose=False) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + if f'{th_name}.running_mean' in th_params.keys(): + th_name_b = f'{th_name}.running_mean' + pd_name_b = f'{pd_name}._mean' + _set_value(th_name_b, pd_name_b) + + if f'{th_name}.running_var' in th_params.keys(): + th_name_b = f'{th_name}.running_var' + pd_name_b = f'{pd_name}._variance' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_shuffle_transformer(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model = ShuffleTransformerTorch(layers=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + qkv_bias=True, + embed_dim=128, + ) + model_state_dict = torch.load('./shuffle_pth/shuffle_vit_base_patch4_window7_224_ep296.pth', map_location='cpu') + torch_model.load_state_dict(model_state_dict['model']) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + + #return + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./shuffle_vit_base_patch4_window7_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/Shuffle_Transformer/port_weights/load_pytorch_weights_base_224.py b/image_classification/Shuffle_Transformer/port_weights/load_pytorch_weights_base_224.py new file mode 100644 index 00000000..512f2fb1 --- /dev/null +++ b/image_classification/Shuffle_Transformer/port_weights/load_pytorch_weights_base_224.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import numpy as np +import paddle +import torch +from cswin import * +from cswin_pytorch.CSWin_Transformer.models import cswin as pytorch_cswin +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/cswin_base_224.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + print('-----------------MODEL NAMED PARAMETERS------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + + +def print_model_named_buffers(model): + print('-----------------MODEL NAMED BUFFERS------------------------') + for name, buff in model.named_buffers(): + print(name, buff.shape) + + +def torch_to_paddle_mapping(): + mapping = [ + ('stage1_conv_embed.0', 'patch_embedding.patch_embed'), + ('stage1_conv_embed.2', 'patch_embedding.norm'), + ] + + for stage_idx, stage_depth in enumerate(config.MODEL.TRANS.DEPTHS): + for idx in range(stage_depth): + th_prefix = f'stage{stage_idx+1}.{idx}' + pp_prefix = f'stages.{stage_idx}.blocks.{idx}' + + layer_mapping = [ + (f'{th_prefix}.qkv', f'{pp_prefix}.qkv'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.proj', f'{pp_prefix}.proj'), + (f'{th_prefix}.attns.0.get_v', f'{pp_prefix}.attns.0.get_v'), + (f'{th_prefix}.attns.1.get_v', f'{pp_prefix}.attns.1.get_v'), # may not exist, ok + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + # prefix for last stage may not exist, it is ok in this code + th_prefix = f'merge{stage_idx+1}' + pp_prefix = f'stages.{stage_idx}.merge' + layer_mapping = [ + (f'{th_prefix}.conv', f'{pp_prefix}.conv'), + (f'{th_prefix}.norm', f'{pp_prefix}.norm'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + value = 
value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + if f'{th_name}.weight' in th_params.keys(): + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_cswin(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('----------------------------------') + + device = torch.device('cpu') + torch_model = pytorch_cswin.CSWin_96_24322_base_224() + model_state_dict = torch.load('./cswin_pytorch/cswin_base_224.pth') + torch_model.load_state_dict(model_state_dict['state_dict_ema']) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./cswin_base_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/Shuffle_Transformer/port_weights/load_pytorch_weights_base_384.py b/image_classification/Shuffle_Transformer/port_weights/load_pytorch_weights_base_384.py new file mode 100644 index 00000000..93b113ad --- /dev/null +++ b/image_classification/Shuffle_Transformer/port_weights/load_pytorch_weights_base_384.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
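+"""Port CSWin-base-384 weights from a PyTorch checkpoint to PaddlePaddle.
+
+Builds the paddle model from ./configs/cswin_base_384.yaml, copies the torch parameters
+over via the name mapping below, compares both models on a random input, and saves the
+result as cswin_base_384.pdparams.
+"""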
+ +import argparse +import numpy as np +import paddle +import torch +from cswin import * +from cswin_pytorch.CSWin_Transformer.models import cswin as pytorch_cswin +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/cswin_base_384.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + print('-----------------MODEL NAMED PARAMETERS------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + + +def print_model_named_buffers(model): + print('-----------------MODEL NAMED BUFFERS------------------------') + for name, buff in model.named_buffers(): + print(name, buff.shape) + + +def torch_to_paddle_mapping(): + mapping = [ + ('stage1_conv_embed.0', 'patch_embedding.patch_embed'), + ('stage1_conv_embed.2', 'patch_embedding.norm'), + ] + + for stage_idx, stage_depth in enumerate(config.MODEL.TRANS.DEPTHS): + for idx in range(stage_depth): + th_prefix = f'stage{stage_idx+1}.{idx}' + pp_prefix = f'stages.{stage_idx}.blocks.{idx}' + + layer_mapping = [ + (f'{th_prefix}.qkv', f'{pp_prefix}.qkv'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.proj', f'{pp_prefix}.proj'), + (f'{th_prefix}.attns.0.get_v', f'{pp_prefix}.attns.0.get_v'), + (f'{th_prefix}.attns.1.get_v', f'{pp_prefix}.attns.1.get_v'), # may not exist, ok + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + # prefix for last stage may not exist, it is ok in this code + th_prefix = f'merge{stage_idx+1}' + pp_prefix = f'stages.{stage_idx}.merge' + layer_mapping = [ + (f'{th_prefix}.conv', f'{pp_prefix}.conv'), + (f'{th_prefix}.norm', f'{pp_prefix}.norm'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + if f'{th_name}.weight' in th_params.keys(): + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_cswin(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('----------------------------------') + + device = torch.device('cpu') + torch_model = pytorch_cswin.CSWin_96_24322_base_384(img_size=384) + model_state_dict = torch.load('./cswin_pytorch/cswin_base_384.pth') + torch_model.load_state_dict(model_state_dict['state_dict_ema']) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 384, 384).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./cswin_base_384.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/Shuffle_Transformer/port_weights/load_pytorch_weights_small.py b/image_classification/Shuffle_Transformer/port_weights/load_pytorch_weights_small.py new file mode 100644 index 00000000..e4db4b0f --- /dev/null +++ b/image_classification/Shuffle_Transformer/port_weights/load_pytorch_weights_small.py @@ -0,0 +1,211 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
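+"""Port the small Shuffle Transformer checkpoint (embed_dim 96, depths 2-2-18-2) from a
+PyTorch .pth file to a PaddlePaddle .pdparams file, after checking that both models
+produce matching outputs on a random input.
+"""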
+ +import argparse +import numpy as np +import paddle +import torch +from shuffle_transformer import * +from shuffle_pth.shuffle_transformer_torch import ShuffleTransformer as ShuffleTransformerTorch +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/shuffle_vit_small_patch4_window7_224.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + + +def torch_to_paddle_mapping(): + # (torch_param_name, paddle_param_name) + mapping = [ + ('to_token.conv1.0', 'patch_embedding.conv1.0'), # conv + ('to_token.conv1.1', 'patch_embedding.conv1.1'), # bn + ('to_token.conv2.0', 'patch_embedding.conv2.0'), # conv + ('to_token.conv2.1', 'patch_embedding.conv2.1'), # bn + ('to_token.conv3', 'patch_embedding.conv3'), # conv + ] + + for stage_idx, num_layers in enumerate(config.MODEL.TRANS.DEPTHS): + for idx in range(num_layers): + th_layer_idx_0 = idx // 2 + th_layer_idx_1 = idx % 2 + th_prefix = f'stage{stage_idx+1}.layers.{th_layer_idx_0}.{th_layer_idx_1}' + pp_prefix = f'stages.{stage_idx}.layers.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), #bn + (f'{th_prefix}.attn.relative_position_bias_table', f'{pp_prefix}.attn.relative_position_bias_table'), # no transpose + (f'{th_prefix}.attn.relative_position_index', f'{pp_prefix}.attn.relative_position_index'), # no transpose + (f'{th_prefix}.attn.to_qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.local', f'{pp_prefix}.local'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), #bn + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + (f'{th_prefix}.norm3', f'{pp_prefix}.norm3'), #bn + ] + mapping.extend(layer_mapping) + + if stage_idx > 0: + layer_mapping = [ + (f'stage{stage_idx+1}.patch_partition.norm', f'stages.{stage_idx}.patch_partition.norm'), #bn + (f'stage{stage_idx+1}.patch_partition.reduction', f'stages.{stage_idx}.patch_partition.reduction'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + if len(value.shape) == 2 and transpose: + value = value.transpose((1, 0)) + 
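+        # set_value writes the converted numpy array into the existing paddle parameter
+        # (or buffer) in place, so no re-assignment is needed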
pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if 'relative_position' in th_name: + _set_value(th_name, pd_name, transpose=False) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + if f'{th_name}.running_mean' in th_params.keys(): + th_name_b = f'{th_name}.running_mean' + pd_name_b = f'{pd_name}._mean' + _set_value(th_name_b, pd_name_b) + + if f'{th_name}.running_var' in th_params.keys(): + th_name_b = f'{th_name}.running_var' + pd_name_b = f'{pd_name}._variance' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_shuffle_transformer(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model = ShuffleTransformerTorch(layers=[2, 2, 18, 2], + num_heads=[3, 6, 12, 24], + qkv_bias=True, + embed_dim=96, + ) + model_state_dict = torch.load('./shuffle_pth/shuffle_vit_small_patch4_window7_224_ep292.pth', map_location='cpu') + torch_model.load_state_dict(model_state_dict['model']) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + + #return + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./shuffle_vit_small_patch4_window7_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/Shuffle_Transformer/port_weights/load_pytorch_weights_tiny.py b/image_classification/Shuffle_Transformer/port_weights/load_pytorch_weights_tiny.py new file mode 100644 index 00000000..bcefaaed --- /dev/null +++ b/image_classification/Shuffle_Transformer/port_weights/load_pytorch_weights_tiny.py @@ -0,0 +1,211 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import numpy as np +import paddle +import torch +from shuffle_transformer import * +from shuffle_pth.shuffle_transformer_torch import ShuffleTransformer as ShuffleTransformerTorch +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/shuffle_vit_tiny_patch4_window7_224.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + + +def torch_to_paddle_mapping(): + # (torch_param_name, paddle_param_name) + mapping = [ + ('to_token.conv1.0', 'patch_embedding.conv1.0'), # conv + ('to_token.conv1.1', 'patch_embedding.conv1.1'), # bn + ('to_token.conv2.0', 'patch_embedding.conv2.0'), # conv + ('to_token.conv2.1', 'patch_embedding.conv2.1'), # bn + ('to_token.conv3', 'patch_embedding.conv3'), # conv + ] + + for stage_idx, num_layers in enumerate(config.MODEL.TRANS.DEPTHS): + for idx in range(num_layers): + th_layer_idx_0 = idx // 2 + th_layer_idx_1 = idx % 2 + th_prefix = f'stage{stage_idx+1}.layers.{th_layer_idx_0}.{th_layer_idx_1}' + pp_prefix = f'stages.{stage_idx}.layers.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), #bn + (f'{th_prefix}.attn.relative_position_bias_table', f'{pp_prefix}.attn.relative_position_bias_table'), # no transpose + (f'{th_prefix}.attn.relative_position_index', f'{pp_prefix}.attn.relative_position_index'), # no transpose + (f'{th_prefix}.attn.to_qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.local', f'{pp_prefix}.local'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), #bn + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + (f'{th_prefix}.norm3', f'{pp_prefix}.norm3'), #bn + ] + mapping.extend(layer_mapping) + + if stage_idx > 0: + layer_mapping = [ + (f'stage{stage_idx+1}.patch_partition.norm', f'stages.{stage_idx}.patch_partition.norm'), #bn + (f'stage{stage_idx+1}.patch_partition.reduction', f'stages.{stage_idx}.patch_partition.reduction'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, transpose=True): + th_shape = th_params[th_name].shape + pd_shape = 
tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + if len(value.shape) == 2 and transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if 'relative_position' in th_name: + _set_value(th_name, pd_name, transpose=False) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + if f'{th_name}.running_mean' in th_params.keys(): + th_name_b = f'{th_name}.running_mean' + pd_name_b = f'{pd_name}._mean' + _set_value(th_name_b, pd_name_b) + + if f'{th_name}.running_var' in th_params.keys(): + th_name_b = f'{th_name}.running_var' + pd_name_b = f'{pd_name}._variance' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_shuffle_transformer(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model = ShuffleTransformerTorch(layers=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + qkv_bias=True, + embed_dim=96, + ) + model_state_dict = torch.load('./shuffle_pth/shuffle_vit_tiny_patch4_window7_224_ep298.pth', map_location='cpu') + torch_model.load_state_dict(model_state_dict['model']) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + + #return + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./shuffle_vit_tiny_patch4_window7_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/Shuffle_Transformer/run_eval.sh b/image_classification/Shuffle_Transformer/run_eval.sh new file 
mode 100644 index 00000000..7d894422 --- /dev/null +++ b/image_classification/Shuffle_Transformer/run_eval.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ +-cfg='./configs/shuffle_vit_tiny_patch4_window7_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=64 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./shuffle_vit_tiny_patch4_window7_224' diff --git a/image_classification/Shuffle_Transformer/run_eval_multi.sh b/image_classification/Shuffle_Transformer/run_eval_multi.sh new file mode 100644 index 00000000..b8084e7e --- /dev/null +++ b/image_classification/Shuffle_Transformer/run_eval_multi.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/shuffle_vit_tiny_patch4_window7_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./shuffle_vit_tiny_patch4_window7_224' \ diff --git a/image_classification/Shuffle_Transformer/run_eval_multi_base.sh b/image_classification/Shuffle_Transformer/run_eval_multi_base.sh new file mode 100644 index 00000000..10356fc5 --- /dev/null +++ b/image_classification/Shuffle_Transformer/run_eval_multi_base.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/shuffle_vit_base_patch4_window7_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./shuffle_vit_base_patch4_window7_224' \ diff --git a/image_classification/Shuffle_Transformer/run_eval_multi_small.sh b/image_classification/Shuffle_Transformer/run_eval_multi_small.sh new file mode 100644 index 00000000..07011f90 --- /dev/null +++ b/image_classification/Shuffle_Transformer/run_eval_multi_small.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/shuffle_vit_small_patch4_window7_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./shuffle_vit_small_patch4_window7_224' \ diff --git a/image_classification/Shuffle_Transformer/run_train.sh b/image_classification/Shuffle_Transformer/run_train.sh new file mode 100644 index 00000000..8c2484d8 --- /dev/null +++ b/image_classification/Shuffle_Transformer/run_train.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ +-cfg='./configs/shuffle_vit_tiny_patch4_window7_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=64 \ +-data_path='/dataset/imagenet' \ diff --git a/image_classification/Shuffle_Transformer/run_train_multi.sh b/image_classification/Shuffle_Transformer/run_train_multi.sh new file mode 100644 index 00000000..eaaa7a61 --- /dev/null +++ b/image_classification/Shuffle_Transformer/run_train_multi.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/shuffle_vit_tiny_patch4_window7_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ diff --git a/image_classification/Shuffle_Transformer/shuffle1.png b/image_classification/Shuffle_Transformer/shuffle1.png new file mode 100644 index 00000000..9674ae53 Binary files /dev/null and b/image_classification/Shuffle_Transformer/shuffle1.png differ diff --git a/image_classification/Shuffle_Transformer/shuffle2.png b/image_classification/Shuffle_Transformer/shuffle2.png new file mode 100644 index 00000000..a70eaa76 Binary files /dev/null and b/image_classification/Shuffle_Transformer/shuffle2.png differ diff --git 
a/image_classification/Shuffle_Transformer/shuffle_transformer.py b/image_classification/Shuffle_Transformer/shuffle_transformer.py new file mode 100644 index 00000000..dc419852 --- /dev/null +++ b/image_classification/Shuffle_Transformer/shuffle_transformer.py @@ -0,0 +1,509 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Implement Shuffle Transformer (https://arxiv.org/abs/2106.03650) """ + +import numpy as np +import paddle +import paddle.nn as nn +from droppath import DropPath + + +class Identity(nn.Layer): + """ Identity layer + + The output of this layer is the input without any change. + Use this layer to avoid if condition in some forward methods + """ + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +class PatchEmbedding(nn.Layer): + """Patch embedding layer + + Apply patch embeddings on input images. Embeddings in implemented using + 2 stacked Conv2D layers. + + Attriubutes: + image_size: int, input image size, default: 224 + patch_size: int, size of an image patch, default: 4 + in_channels: int, input image channels, default: 3 + inter_dim: int, intermediate dim for conv layers, default: 32 + embed_dim: int, embedding dimension, default: 48 + """ + def __init__(self, + image_size=224, + inter_dim=32, + embed_dim=48, + in_channels=3): + super().__init__() + self.conv1 = nn.Sequential( + nn.Conv2D(in_channels, inter_dim, kernel_size=3, stride=2, padding=1), + nn.BatchNorm2D(inter_dim), + nn.ReLU6()) + + self.conv2 = nn.Sequential( + nn.Conv2D(inter_dim, embed_dim, kernel_size=3, stride=2, padding=1), + nn.BatchNorm2D(embed_dim), + nn.ReLU6()) + + self.conv3 = nn.Conv2D(embed_dim, embed_dim, kernel_size=1, stride=1, padding=0) + + # 4 = stride * stride + self.num_patches = (image_size // 4) * (image_size // 4) + + def forward(self, inputs): + out = self.conv1(inputs) + out = self.conv2(out) + out = self.conv3(out) + return out + + +class MLP(nn.Layer): + """MLP module + + A MLP layer which uses 1x1 conv instead of linear layers. + ReLU6 is used as activation function. + + Args: + in_features: int, input feature dim. + hidden_features: int, hidden feature dim. + out_features: int, output feature dim. + dropout: flaot, dropout rate, default: 0.0. + """ + + def __init__(self, in_features, hidden_features=None, out_features=None, dropout=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2D(in_features, hidden_features, 1, 1, 0) + self.act = nn.ReLU6() + self.fc2 = nn.Conv2D(hidden_features, out_features, 1, 1, 0) + self.dropout = nn.Dropout(dropout) + + def forward(self, inputs): + out = self.fc1(inputs) # [batch_size, hidden_dim, height, width] + out = self.act(out) + out = self.dropout(out) + out = self.fc2(out) + out = self.dropout(out) + return out + + +class WindowAttention(nn.Layer): + """ Window Multihead Aelf-attention Module. 
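+    Attention is computed inside non-overlapping windows; when `shuffle` is True the
+    windows are built from spatially shuffled token positions (see `transpose_multihead`).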
+ This module use 1x1 Conv as the qkv proj and linear proj + Args: + dim: int, input dimension. + num_heads: int, number of attention heads. + windows_size: int, the window size of attention modules, default: 1 + shuffle: bool, if True, use output shuffle, default: False + qk_scale: float, if set, override default qk scale, default: None + qkv_bias: bool, if True, enable bias to qkv, default: False + dropout: float, dropout for output, default: 0. + attention_dropout: float, dropout of attention, default: 0. + """ + def __init__(self, + dim, + num_heads, + window_size=1, + shuffle=False, + qk_scale=None, + qkv_bias=False, + dropout=0., + attention_dropout=0.): + super().__init__() + self.num_heads = num_heads + self.head_dim = dim // self.num_heads + self.window_size = window_size + self.shuffle = shuffle + self.scale = qk_scale or self.head_dim ** -0.5 + + self.qkv = nn.Conv2D(dim, dim * 3, kernel_size=1, bias_attr=qkv_bias) + self.attention_dropout = nn.Dropout(attention_dropout) + self.proj = nn.Conv2D(dim, dim, kernel_size=1) + self.proj_dropout = nn.Dropout(dropout) + self.softmax = nn.Softmax(axis=-1) + + self.relative_position_bias_table = paddle.create_parameter( + shape=[(2 * window_size - 1) * (2 * window_size - 1), num_heads], + dtype='float32', + default_initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + + # relative position index for each token inside window + coords_h = paddle.arange(0, self.window_size) + coords_w = paddle.arange(0, self.window_size) + coords = paddle.stack(paddle.meshgrid([coords_h, coords_w])) + coords_flatten = paddle.flatten(coords, 1) # [2, window_h * window_w] + # 2, window_h * window_w, window_h * window_h + relative_coords = coords_flatten.unsqueeze(2) - coords_flatten.unsqueeze(1) + # winwod_h*window_w, window_h*window_w, 2 + relative_coords = relative_coords.transpose([1, 2, 0]) + relative_coords[:, :, 0] += self.window_size - 1 + relative_coords[:, :, 1] += self.window_size - 1 + relative_coords[:, :, 0] *= 2 * self.window_size - 1 + # [window_size * window_size, window_size*window_size] + relative_position_index = relative_coords.sum(-1) + self.register_buffer("relative_position_index", relative_position_index) + + def get_relative_pos_bias_from_pos_index(self): + # relative_position_bias_table is a ParamBase object + table = self.relative_position_bias_table # N x num_heads + # index is a tensor + index = self.relative_position_index.reshape([-1]) + # window_h*window_w * window_h*window_w + relative_position_bias = paddle.index_select(x=table, index=index) + return relative_position_bias + + def transpose_multihead(self, x): + B, C, H, W = x.shape + n_window = H // self.window_size + if self.shuffle: + x = x.reshape([B, + self.num_heads, + self.head_dim, + self.window_size, # window_size first + n_window, + self.window_size, + n_window]) + x = x.transpose([0, 4, 6, 1, 3, 5, 2]) # order matters + else: + x = x.reshape([B, + self.num_heads, + self.head_dim, + n_window, # n_window first + self.window_size, + n_window, + self.window_size]) + x = x.transpose([0, 3, 5, 1, 4, 6, 2]) # order metters + + x = x.reshape([B * n_window * n_window, + self.num_heads, + self.window_size * self.window_size, + self.head_dim]) + return x + + def transpose_multihead_reverse(self, x, B, H, W): + assert H == W + n_window = H // self.window_size + x = x.reshape([B, + n_window, + n_window, + self.num_heads, + self.window_size, + self.window_size, + self.head_dim]) + if self.shuffle: + x = x.transpose([0, 3, 6, 4, 1, 5, 2]) + else: + x = x.transpose([0, 3, 
6, 1, 4, 2, 5]) + x = x.reshape([B, + self.num_heads * self.head_dim, + self.window_size * n_window, + self.window_size * n_window]) + return x + + def forward(self, inputs): + B, C, H, W = inputs.shape + qkv = self.qkv(inputs).chunk(3, axis=1) # qkv is a tuple: (q, k, v) + + # Now q, k, and v has the following shape: + # Case1: [B, (num_heads * head_dim), (window_size * n_window), (window_size * n_window)] + # Case2: [B, (num_heads * head_dim), (n_window * window_size), (n_window * window_size)] + # where Case 1 is used when shuffle is True, Case 2 is used for no shuffle + + # with/without spatial shuffle + # shape = [(B * n_window * n_window), num_heads, (window_size * window_size), head_dim] + q, k, v = map(self.transpose_multihead, qkv) + + q = q * self.scale + attn = paddle.matmul(q, k, transpose_y=True) + + relative_position_bias = self.get_relative_pos_bias_from_pos_index() + + relative_position_bias = relative_position_bias.reshape( + [self.window_size * self.window_size, + self.window_size * self.window_size, + -1]) + # nH, window_h * window_w, window_h * window_h + relative_position_bias = paddle.transpose(relative_position_bias, perm=[2, 0, 1]) + + attn = attn + relative_position_bias.unsqueeze(0) + + attn = self.softmax(attn) + z = paddle.matmul(attn, v) + + + # shape = [(B * n_window * n_window), num_heads, (window_size * window_size), head_dim] + # new shape=[B, (num_heads * head_dim), (n_window * window_size), (n_window * window_size)] + z = self.transpose_multihead_reverse(z, B, H, W) + + z = self.proj(z) + z = self.proj_dropout(z) + + return z + + +class ShuffleBlock(nn.Layer): + """Shuffle block layers + + Shuffle block layers contains multi head attention, conv, + droppath, mlp, batch_norm and residual. + + Attributes: + dim: int, embedding dimension + out_dim: int, stage output dim + num_heads: int, num of attention heads + window_size: int, window size, default: 1 + shuffle: bool, if True, apply channel shuffle, default: False + mlp_ratio: float, ratio of mlp hidden dim and input dim, default: 4. + qk_scale: float, if set, override default qk scale, default: None + qkv_bias: bool, if True, enable bias to qkv, default: False + dropout: float, dropout for output, default: 0. + attention_dropout: float, dropout of attention, default: 0. + droppath: float, drop path rate, default: 0. 
+ """ + def __init__(self, + dim, + out_dim, + num_heads, + window_size=1, + shuffle=False, + mlp_ratio=4, + qk_scale=None, + qkv_bias=False, + dropout=0., + attention_dropout=0., + droppath=0.): + super().__init__() + self.norm1 = nn.BatchNorm2D(dim) + self.attn = WindowAttention(dim, + num_heads=num_heads, + window_size=window_size, + shuffle=shuffle, + qk_scale=qk_scale, + qkv_bias=qkv_bias, + dropout=dropout, + attention_dropout=attention_dropout) + # neighbor-window connection enhancement (NWC) + self.local = nn.Conv2D(dim, + dim, + kernel_size=window_size, + stride=1, + padding=window_size // 2, + groups=dim) + self.drop_path = DropPath(droppath) + self.norm2 = nn.BatchNorm2D(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = MLP(dim, mlp_hidden_dim, out_dim, dropout) + self.norm3 = nn.BatchNorm2D(dim) + + def forward(self, x): + # attention + h = x + x = self.norm1(x) + x = self.attn(x) + x = self.drop_path(x) + x = h + x + # neighbor-window connection enhancement (NWC) + h = x + x = self.norm2(x) + x = self.local(x) + x = h + x + # mlp + h = x + x = self.norm3(x) + x = self.mlp(x) + x = self.drop_path(x) + x = h + x + return x + + +class PatchMerging(nn.Layer): + """Patch Merging + Merge the patches by a BatchNorm and a Conv2D with kernel size 2x2 + and stride 2, to reduce the number of tokens + """ + def __init__(self, in_dim=32, out_dim=64): + super().__init__() + self.norm = nn.BatchNorm2D(in_dim) + self.reduction = nn.Conv2D(in_dim, + out_dim, + kernel_size=2, + stride=2, + padding=0, + bias_attr=False) + + def forward(self, inputs): + out = self.norm(inputs) + out = self.reduction(out) + return out + + +class StageModule(nn.Layer): + """Stage layer for shuffle transformer + + Stage layers contains a number of Transformer blocks and an optional + patch merging layer, patch merging is not applied after last stage + + Attributes: + num_layers: int, num of blocks in stage + dim: int, embedding dimension + out_dim: int, stage output dim + num_heads: int, num of attention heads + window_size: int, window size, default: 1 + mlp_ratio: float, ratio of mlp hidden dim and input dim, default: 4. + qk_scale: float, if set, override default qk scale, default: None + qkv_bias: bool, if True, enable bias to qkv, default: False + dropout: float, dropout for output, default: 0. + attention_dropout: float, dropout of attention, default: 0. + droppath: float, drop path rate, default: 0. + """ + def __init__(self, + num_layers, + dim, + out_dim, + num_heads, + window_size=1, + shuffle=True, + mlp_ratio=4., + qk_scale=None, + qkv_bias=False, + dropout=0., + attention_dropout=0., + droppath=0.): + super().__init__() + assert num_layers % 2 == 0, "Stage layers must be even for shifted block." 
+ if dim != out_dim: + self.patch_partition = PatchMerging(in_dim=dim, out_dim=out_dim) + else: + self.patch_partition = Identity() + + self.layers = nn.LayerList() + for idx in range(num_layers): + shuffle = idx % 2 != 0 + self.layers.append(ShuffleBlock(dim=out_dim, + out_dim=out_dim, + num_heads=num_heads, + window_size=window_size, + shuffle=shuffle, + mlp_ratio=mlp_ratio, + qk_scale=qk_scale, + qkv_bias=qkv_bias, + dropout=dropout, + attention_dropout=attention_dropout, + droppath=droppath)) + + def forward(self, inputs): + out = self.patch_partition(inputs) + for layer in self.layers: + out = layer(out) + + return out + + +class ShuffleTransformer(nn.Layer): + """Shuffle Transformer + Args: + image_size: int, input image size, default: 224 + num_classes: int, num of classes, default: 1000 + token_dim: int, intermediate feature dim in PatchEmbedding, default: 32 + embed_dim: int, embedding dim (out dim for PatchEmbedding), default: 96 + mlp_ratio: float, ratio for mlp dim, mlp hidden_dim = mlp in_dim * mlp_ratio, default: 4. + layers: list of int, num of layers in each stage, default: [2, 2, 6, 2] + num_heads: list of int, num of heads in each stage, default: [3, 6, 12, 24] + window_size: int, attention window size, default: 7 + qk_scale: float, if set, override default qk scale (head_dim**-0.5), default: None + qkv_bias: bool, if True, qkv layers is set with bias, default: False + attention_dropout: float, dropout rate of attention, default: 0.0 + dropout: float, dropout rate for output, default: 0.0 + droppath: float, droppath rate, default: 0.0 + """ + + def __init__(self, + image_size=224, + num_classes=1000, + token_dim=32, + embed_dim=96, + mlp_ratio=4., + layers=(2, 2, 6, 2), + num_heads=(3, 6, 12, 24), + window_size=7, + qk_scale=None, + qkv_bias=False, + attention_dropout=0., + dropout=0., + droppath=0.): + super().__init__() + self.num_classes = num_classes + self.num_features = embed_dim + self.embed_dim = embed_dim + dims = [embed_dim] + dims.extend([i * 32 for i in num_heads]) # dims for each stage + + self.patch_embedding = PatchEmbedding(image_size=image_size, + inter_dim=token_dim, + embed_dim=embed_dim) + #num_patches = self.patch_embedding.num_patches + self.num_stages = len(layers) + dprs = [x.item() for x in np.linspace(0, droppath, self.num_stages)] + + self.stages = nn.LayerList() + for i in range(self.num_stages): + self.stages.append(StageModule(layers[i], + dims[i], + dims[i+1], + num_heads[i], + window_size=window_size, + mlp_ratio=mlp_ratio, + qk_scale=qk_scale, + qkv_bias=qkv_bias, + attention_dropout=attention_dropout, + dropout=dropout, + droppath=dprs[i])) + self.avgpool = nn.AdaptiveAvgPool2D(1) + self.head = nn.Linear(dims[-1], num_classes) + + def forward_features(self, inputs): + out = self.patch_embedding(inputs) + B, C, H, W = out.shape + + for idx, stage in enumerate(self.stages): + out = stage(out) + + out = self.avgpool(out) + out = paddle.flatten(out, 1) + return out + + def forward(self, inputs): + out = self.forward_features(inputs) + out = self.head(out) + return out + + +def build_shuffle_transformer(config): + """ build shuffle transformer using config""" + model = ShuffleTransformer(image_size=config.DATA.IMAGE_SIZE, + embed_dim=config.MODEL.TRANS.EMBED_DIM, + mlp_ratio=config.MODEL.TRANS.MLP_RATIO, + layers=config.MODEL.TRANS.DEPTHS, + num_heads=config.MODEL.TRANS.NUM_HEADS, + window_size=config.MODEL.TRANS.WINDOW_SIZE, + qk_scale=config.MODEL.TRANS.QK_SCALE, + qkv_bias=config.MODEL.TRANS.QKV_BIAS) + return model diff --git 
a/image_classification/Shuffle_Transformer/utils.py b/image_classification/Shuffle_Transformer/utils.py new file mode 100644 index 00000000..9c5dffa3 --- /dev/null +++ b/image_classification/Shuffle_Transformer/utils.py @@ -0,0 +1,147 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""utils for Shuffle ViT + +Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training +and WarmupCosineScheduler for training + +""" + +import math +from paddle.optimizer.lr import LRScheduler +import numpy as np + + +class MyPrint(): + """" Print tensor and its shape, used for debug """ + def __init__(self): + self.cnt = 0 + def myprint(self, prefix, var, cnt=None, save=None): + """print tensor and its shape, optionly save to npy + Args: + prefix: str, print info in 1st and last lines + var: Tensor, tensor needs to print + cnt: int, if self.cnt is exceed this value, print will stop + save: str, file name (should end with .npy) to save the tensor, if None no save + """ + if cnt is None or self.cnt < cnt: + print(f'------------ {prefix} ---------------') + print(var.shape, var) + print(f'------------ END {prefix} ---------------') + if save is not None: + var = var.numpy() + with open(save,'wb') as ofile: + np.save(ofile, var) + self.cnt += 1 + + +class AverageMeter(): + """ Meter for monitoring losses""" + def __init__(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.val = 0 + self.reset() + + def reset(self): + """reset all values to zeros""" + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.val = 0 + + def update(self, val, n=1): + """update avg by val and n, where val is the avg of n values""" + self.val = val + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + + +def get_exclude_from_weight_decay_fn(exclude_list=[]): + """ Set params with no weight decay during the training + + For certain params, e.g., positional encoding in ViT, weight decay + may not needed during the learning, this method is used to find + these params. + + Args: + exclude_list: a list of params names which need to exclude + from weight decay. + Returns: + exclude_from_weight_decay_fn: a function returns True if param + will be excluded from weight decay + """ + if len(exclude_list) == 0: + exclude_from_weight_decay_fn = None + else: + def exclude_fn(param): + for name in exclude_list: + if param.endswith(name): + return False + return True + exclude_from_weight_decay_fn = exclude_fn + return exclude_from_weight_decay_fn + + +class WarmupCosineScheduler(LRScheduler): + """Warmup Cosine Scheduler + + First apply linear warmup, then apply cosine decay schedule. + Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" + Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining + "total_epochs - warmup_epochs" + + Attributes: + learning_rate: the starting learning rate (without warmup), not used here! 
+ warmup_start_lr: warmup starting learning rate + start_lr: the starting learning rate (without warmup) + end_lr: the ending learning rate after whole loop + warmup_epochs: # of epochs for warmup + total_epochs: # of total epochs (include warmup) + """ + def __init__(self, + learning_rate, + warmup_start_lr, + start_lr, + end_lr, + warmup_epochs, + total_epochs, + cycles=0.5, + last_epoch=-1, + verbose=False): + """init WarmupCosineScheduler """ + self.warmup_epochs = warmup_epochs + self.total_epochs = total_epochs + self.warmup_start_lr = warmup_start_lr + self.start_lr = start_lr + self.end_lr = end_lr + self.cycles = cycles + super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + """ return lr value """ + if self.last_epoch < self.warmup_epochs: + val = (self.start_lr - self.warmup_start_lr) * float( + self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr + return val + + progress = float(self.last_epoch - self.warmup_epochs) / float( + max(1, self.total_epochs - self.warmup_epochs)) + val = max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) + return val diff --git a/image_classification/SwinTransformer/README.md b/image_classification/SwinTransformer/README.md new file mode 100644 index 00000000..92081ccd --- /dev/null +++ b/image_classification/SwinTransformer/README.md @@ -0,0 +1,164 @@ +# Swin Transformer: Hierarchical Vision Transformer using Shifted Windows, [arxiv](https://arxiv.org/pdf/2103.14030.pdf) + +PaddlePaddle training/validation code and pretrained models for **Swin Transformer**. + +The official pytorch implementation is [here](https://github.com/microsoft/Swin-Transformer). + +This implementation is developed by [PaddleViT](https://github.com/BR-IDL/PaddleViT.git). + + +

+drawing +

Swin Transformer Model Overview

+

+ +### Update +Update (2021-08-11): Code is released and ported weights are uploaded. + +## Models Zoo +| Model | Acc@1 | Acc@5 | Image Size | Crop_pct | Interpolation | Link | +|--------------------------------|-------|-------|------------|----------|---------------|--------------| +| swin_base_patch4_window7_224 | 85.27 | 97.56 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1yjZFJoJeDFIfsxh9x10XGqCb8s2-Gtbp/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1AseY3CKmJvlxoSwXnxHEwA)(wyck) | +| swin_base_patch4_window12_384 | 86.43 | 98.07 | 384 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1ThmGsTDZ8217-Zuo9o5EGLfzw8AI6N0w/view?usp=sharing)/[baidu](https://pan.baidu.com/s/10E3F9jqBeBTcasIvJ8iMzg)(4a95) | +| swin_large_patch4_window12_384 | 87.14 | 98.23 | 384 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1f30Mt80g5yLfEiViT4-kMLpyDjTUTV5B/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1w5w8QNfg0zY3nSfGs-Tx3A)(j71u) | + +> *The results are evaluated on ImageNet2012 validation set. + +## Notebooks +We provide a few notebooks in aistudio to help you get started: + +**\*(coming soon)\*** + + +## Requirements +- Python>=3.6 +- yaml>=0.2.5 +- [PaddlePaddle](https://www.paddlepaddle.org.cn/documentation/docs/en/install/index_en.html)>=2.1.0 +- [yacs](https://github.com/rbgirshick/yacs)>=0.1.8 + +## Data +ImageNet2012 dataset is used in the following folder structure: +``` +│imagenet/ +├──train/ +│ ├── n01440764 +│ │ ├── n01440764_10026.JPEG +│ │ ├── n01440764_10027.JPEG +│ │ ├── ...... +│ ├── ...... +├──val/ +│ ├── n01440764 +│ │ ├── ILSVRC2012_val_00000293.JPEG +│ │ ├── ILSVRC2012_val_00002138.JPEG +│ │ ├── ...... +│ ├── ...... +``` + +## Usage +To use the model with pretrained weights, download the `.pdparam` weight file and change related file paths in the following python scripts. The model config files are located in `./configs/`. + +For example, assume the downloaded weight file is stored in `./swin_base_patch4_window7_224.pdparams`, to use the `swin_base_patch4_window7_224` model in python: +```python +from config import get_config +from swin import build_swin as build_model +# config files in ./configs/ +config = get_config('./configs/swin_base_patch4_window7_224.yaml') +# build model +model = build_model(config) +# load pretrained weights, .pdparams is NOT needed +model_state_dict = paddle.load('./swin_base_patch4_window7_224') +model.set_dict(model_state_dict) +``` + +## Evaluation +To evaluate Swin Transformer model performance on ImageNet2012 with a single GPU, run the following script using command line: +```shell +sh run_eval.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/swin_base_patch4_window7_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./swin_base_patch4_window7_224' +``` + +
+ + +Run evaluation using multi-GPUs: + + + +```shell +sh run_eval_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/swin_base_patch4_window7_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./swin_base_patch4_window7_224' +``` + +
+ + +## Training +To train the Swin Transformer model on ImageNet2012 with single GPU, run the following script using command line: +```shell +sh run_train.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_singel_gpu.py \ + -cfg='./configs/swin_base_patch4_window7_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=32 \ + -data_path='/dataset/imagenet' \ +``` + +
+ + +Run training using multi-GPUs: + + + +```shell +sh run_train_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/swin_base_patch4_window7_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ +``` + +
+ + +## Visualization Attention Map +**(coming soon)** + +## Reference +``` +@article{liu2021swin, + title={Swin transformer: Hierarchical vision transformer using shifted windows}, + author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining}, + journal={arXiv preprint arXiv:2103.14030}, + year={2021} +} +``` diff --git a/image_classification/SwinTransformer/augmentation.py b/image_classification/SwinTransformer/augmentation.py new file mode 100644 index 00000000..811a0cea --- /dev/null +++ b/image_classification/SwinTransformer/augmentation.py @@ -0,0 +1,3 @@ +import paddle +import paddle.nn as nn + diff --git a/image_classification/SwinTransformer/config.py b/image_classification/SwinTransformer/config.py new file mode 100644 index 00000000..871d7858 --- /dev/null +++ b/image_classification/SwinTransformer/config.py @@ -0,0 +1,167 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration + +Configuration for data, model archtecture, and training, etc. +Config can be set by .yaml file or by argparser(limited usage) + + +""" + +import os +from yacs.config import CfgNode as CN +import yaml + +_C = CN() +_C.BASE = [''] + +# data settings +_C.DATA = CN() +_C.DATA.BATCH_SIZE = 8 #1024 batch_size for single GPU +_C.DATA.BATCH_SIZE_EVAL = 8 #1024 batch_size for single GPU +_C.DATA.DATA_PATH = '/dataset/imagenet/' # path to dataset +_C.DATA.DATASET = 'imagenet2012' # dataset name +_C.DATA.IMAGE_SIZE = 224 # input image size +_C.DATA.CROP_PCT = 0.9 # input image scale ratio, scale is applied before centercrop in eval mode +_C.DATA.NUM_WORKERS = 2 # number of data loading threads + +# model settings +_C.MODEL = CN() +_C.MODEL.TYPE = 'Swin' +_C.MODEL.NAME = 'Swin' +_C.MODEL.RESUME = None +_C.MODEL.PRETRAINED = None +_C.MODEL.NUM_CLASSES = 1000 +_C.MODEL.DROPOUT = 0.0 +_C.MODEL.ATTENTION_DROPOUT = 0.0 +_C.MODEL.DROP_PATH = 0.1 + +# transformer settings +_C.MODEL.TRANS = CN() +_C.MODEL.TRANS.PATCH_SIZE = 4 # image_size = patch_size x window_size x num_windows +_C.MODEL.TRANS.WINDOW_SIZE = 7 +_C.MODEL.TRANS.IN_CHANNELS = 3 +_C.MODEL.TRANS.EMBED_DIM = 96 # same as HIDDEN_SIZE in ViT +_C.MODEL.TRANS.STAGE_DEPTHS = [2, 2, 6, 2] +_C.MODEL.TRANS.NUM_HEADS = [3, 6, 12, 24] +_C.MODEL.TRANS.MLP_RATIO = 4. 
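+# QKV_BIAS enables a learnable bias on the q/k/v projections; QK_SCALE, when set,
+# overrides the default attention scale of head_dim ** -0.5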
+_C.MODEL.TRANS.QKV_BIAS = True +_C.MODEL.TRANS.QK_SCALE = None +_C.MODEL.TRANS.APE = False # absolute positional embeddings +_C.MODEL.TRANS.PATCH_NORM = True + +# training settings +_C.TRAIN = CN() +_C.TRAIN.LAST_EPOCH = 0 +_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 20 +_C.TRAIN.WEIGHT_DECAY = 0.05 +_C.TRAIN.BASE_LR = 0.001 +_C.TRAIN.WARMUP_START_LR = 0.0 +_C.TRAIN.END_LR = 0.0 +_C.TRAIN.GRAD_CLIP = 1.0 +_C.TRAIN.ACCUM_ITER = 2 + +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' +_C.TRAIN.LR_SCHEDULER.MILESTONES = "30, 60, 90" # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 # only used in StepLRScheduler + +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'AdamW' +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for adamW +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# augmentation +_C.AUG = CN() +_C.AUG.COLOR_JITTER = 0.4 # color jitter factor +_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' +_C.AUG.RE_PROB = 0.25 # random earse prob +_C.AUG.RE_MODE = 'pixel' # random earse mode +_C.AUG.RE_COUNT = 1 # random earse count +_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 +_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 +_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha +_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled +_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled +_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' + +# misc +_C.SAVE = "./output" +_C.TAG = "default" +_C.SAVE_FREQ = 20 # freq to save chpt +_C.REPORT_FREQ = 50 # freq to logging info +_C.VALIDATE_FREQ = 20 # freq to do validation +_C.SEED = 0 +_C.EVAL = False # run evaluation only +_C.LOCAL_RANK = 0 +_C.NGPUS = -1 + + +def _update_config_from_file(config, cfg_file): + config.defrost() + with open(cfg_file, 'r') as infile: + yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader) + for cfg in yaml_cfg.setdefault('BASE', ['']): + if cfg: + _update_config_from_file( + config, os.path.join(os.path.dirname(cfg_file), cfg) + ) + print('merging config from {}'.format(cfg_file)) + config.merge_from_file(cfg_file) + config.freeze() + +def update_config(config, args): + """Update config by ArgumentParser + Args: + args: ArgumentParser contains options + Return: + config: updated config + """ + if args.cfg: + _update_config_from_file(config, args.cfg) + config.defrost() + if args.dataset: + config.DATA.DATASET = args.dataset + if args.batch_size: + config.DATA.BATCH_SIZE = args.batch_size + if args.image_size: + config.DATA.IMAGE_SIZE = args.image_size + if args.data_path: + config.DATA.DATA_PATH = args.data_path + if args.ngpus: + config.NGPUS = args.ngpus + if args.eval: + config.EVAL = True + config.DATA.BATCH_SIZE_EVAL = args.batch_size + if args.pretrained: + config.MODEL.PRETRAINED = args.pretrained + if args.resume: + config.MODEL.RESUME = args.resume + if args.last_epoch: + config.TRAIN.LAST_EPOCH = args.last_epoch + + #config.freeze() + return config + + +def get_config(cfg_file=None): + """Return a clone of config or load from yaml file""" + config = _C.clone() + if cfg_file: + _update_config_from_file(config, cfg_file) + return config diff --git a/image_classification/SwinTransformer/configs/swin_base_patch4_window12_384.yaml b/image_classification/SwinTransformer/configs/swin_base_patch4_window12_384.yaml new file mode 100644 index 
00000000..90b01a6f --- /dev/null +++ b/image_classification/SwinTransformer/configs/swin_base_patch4_window12_384.yaml @@ -0,0 +1,13 @@ +DATA: + IMAGE_SIZE: 384 + CROP_PCT: 1.0 +MODEL: + TYPE: swin + NAME: swin_base_patch4_window12_384 + DROP_PATH: 0.5 + TRANS: + PATCH_SIZE: 4 + WINDOW_SIZE: 12 + EMBED_DIM: 128 + STAGE_DEPTHS: [2, 2, 18, 2] + NUM_HEADS: [4, 8, 16, 32] diff --git a/image_classification/SwinTransformer/configs/swin_base_patch4_window7_224.yaml b/image_classification/SwinTransformer/configs/swin_base_patch4_window7_224.yaml new file mode 100644 index 00000000..9a1d075e --- /dev/null +++ b/image_classification/SwinTransformer/configs/swin_base_patch4_window7_224.yaml @@ -0,0 +1,14 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.90 +MODEL: + TYPE: swin + NAME: swin_base_patch4_window7_224 + DROP_PATH: 0.5 + TRANS: + EMBED_DIM: 128 + STAGE_DEPTHS: [2, 2, 18, 2] + NUM_HEADS: [4, 8, 16, 32] + WINDOW_SIZE: 7 + PATCH_SIZE: 4 + diff --git a/image_classification/SwinTransformer/configs/swin_large_patch4_window12_384.yaml b/image_classification/SwinTransformer/configs/swin_large_patch4_window12_384.yaml new file mode 100644 index 00000000..cca8c9ee --- /dev/null +++ b/image_classification/SwinTransformer/configs/swin_large_patch4_window12_384.yaml @@ -0,0 +1,14 @@ +DATA: + IMAGE_SIZE: 384 + CROP_PCT: 1.0 +MODEL: + TYPE: swin + NAME: swin_large_patch4_window12_384 + DROP_PATH: 0.5 + TRANS: + EMBED_DIM: 192 + STAGE_DEPTHS: [2, 2, 18, 2] + NUM_HEADS: [6, 12, 24, 48] + WINDOW_SIZE: 12 + PATCH_SIZE: 4 + diff --git a/image_classification/SwinTransformer/datasets.py b/image_classification/SwinTransformer/datasets.py new file mode 100644 index 00000000..6472a6b5 --- /dev/null +++ b/image_classification/SwinTransformer/datasets.py @@ -0,0 +1,186 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Dataset related classes and methods for ViT training and validation +Cifar10, Cifar100 and ImageNet2012 are supported +""" + +import os +import math +from paddle.io import Dataset, DataLoader, DistributedBatchSampler +from paddle.vision import transforms, datasets, image_load + +class ImageNet2012Dataset(Dataset): + """Build ImageNet2012 dataset + + This class gets train/val imagenet datasets, which loads transfomed data and labels. 
+ + Attributes: + file_folder: path where imagenet images are stored + transform: preprocessing ops to apply on image + img_path_list: list of full path of images in whole dataset + label_list: list of labels of whole dataset + """ + + def __init__(self, file_folder, mode="train", transform=None): + """Init ImageNet2012 Dataset with dataset file path, mode(train/val), and transform""" + super(ImageNet2012Dataset, self).__init__() + assert mode in ["train", "val"] + self.file_folder = file_folder + self.transform = transform + self.img_path_list = [] + self.label_list = [] + + if mode == "train": + self.list_file = os.path.join(self.file_folder, "train_list.txt") + else: + self.list_file = os.path.join(self.file_folder, "val_list.txt") + + with open(self.list_file, 'r') as infile: + for line in infile: + img_path = line.strip().split()[0] + img_label = int(line.strip().split()[1]) + self.img_path_list.append(os.path.join(self.file_folder, img_path)) + self.label_list.append(img_label) + print(f'----- Imagenet2012 image {mode} list len = {len(self.label_list)}') + + def __len__(self): + return len(self.label_list) + + def __getitem__(self, index): + data = image_load(self.img_path_list[index]).convert('RGB') + data = self.transform(data) + label = self.label_list[index] + + return data, label + + +def get_train_transforms(config): + """ Get training transforms + + For training, a RandomResizedCrop is applied, then normalization is applied with + [0.5, 0.5, 0.5] mean and std. The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + transforms_train = transforms.Compose([ + transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), + scale=(0.05, 1.0)), + transforms.ToTensor(), + #transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_train + + +def get_val_transforms(config): + """ Get training transforms + + For validation, image is first Resize then CenterCrop to image_size. + Then normalization is applied with [0.5, 0.5, 0.5] mean and std. + The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) + transforms_val = transforms.Compose([ + transforms.Resize(scale_size, interpolation='bicubic'), + transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), + transforms.ToTensor(), + #transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_val + + +def get_dataset(config, mode='train'): + """ Get dataset from config and mode (train/val) + + Returns the related dataset object according to configs and mode(train/val) + + Args: + config: configs contains dataset related settings. 
see config.py for details + Returns: + dataset: dataset object + """ + + assert mode in ['train', 'val'] + if config.DATA.DATASET == "cifar10": + if mode == 'train': + dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) + else: + dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "cifar100": + if mode == 'train': + dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config)) + else: + dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "imagenet2012": + if mode == 'train': + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_train_transforms(config)) + else: + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_val_transforms(config)) + else: + raise NotImplementedError( + "[{config.DATA.DATASET}] Only cifar10, cifar100, imagenet2012 are supported now") + return dataset + + +def get_dataloader(config, dataset, mode='train', multi_process=False): + """Get dataloader with config, dataset, mode as input, allows multiGPU settings. + + Multi-GPU loader is implements as distributedBatchSampler. + + Args: + config: see config.py for details + dataset: paddle.io.dataset object + mode: train/val + multi_process: if True, use DistributedBatchSampler to support multi-processing + Returns: + dataloader: paddle.io.DataLoader object. + """ + + if mode == 'train': + batch_size = config.DATA.BATCH_SIZE + else: + batch_size = config.DATA.BATCH_SIZE_EVAL + + if multi_process is True: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=(mode == 'train')) + dataloader = DataLoader(dataset, + batch_sampler=sampler, + num_workers=config.DATA.NUM_WORKERS) + else: + dataloader = DataLoader(dataset, + batch_size=batch_size, + num_workers=config.DATA.NUM_WORKERS, + shuffle=(mode == 'train')) + return dataloader diff --git a/image_classification/SwinTransformer/drop.py b/image_classification/SwinTransformer/drop.py new file mode 100644 index 00000000..d46205a0 --- /dev/null +++ b/image_classification/SwinTransformer/drop.py @@ -0,0 +1,23 @@ +import paddle +import paddle.nn as nn + +def drop_path(x, drop_prob=0., training=False): + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0], ) + (1,)*(x.ndim-1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + #random_tensor.to(x.device) + random_tensor = random_tensor.floor() + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + diff --git a/image_classification/SwinTransformer/droppath.py b/image_classification/SwinTransformer/droppath.py new file mode 100644 index 00000000..d7ecf00c --- /dev/null +++ b/image_classification/SwinTransformer/droppath.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth +""" + +import numpy as np +import paddle +import paddle.nn as nn + + +class DropPath(nn.Layer): + """DropPath class""" + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def drop_path(self, inputs): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if self.drop_prob == 0. or not self.training: + return inputs + keep_prob = 1 - self.drop_prob + keep_prob = paddle.to_tensor(keep_prob, dtype='float32') + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide(keep_prob) * random_tensor # divide is to keep same output expectation + return output + + def forward(self, inputs): + return self.drop_path(inputs) + + +#def main(): +# tmp = paddle.to_tensor(np.random.rand(8, 16, 8, 8), dtype='float32') +# dp = DropPath(0.5) +# out = dp(tmp) +# print(out) +# +#if __name__ == "__main__": +# main() diff --git a/image_classification/SwinTransformer/main_multi_gpu.py b/image_classification/SwinTransformer/main_multi_gpu.py new file mode 100644 index 00000000..5992f6c7 --- /dev/null +++ b/image_classification/SwinTransformer/main_multi_gpu.py @@ -0,0 +1,366 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Swin training/validation using multiple GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as dist +from datasets import get_dataloader, get_dataset +from swin_transformer import build_swin as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from utils import get_exclude_from_weight_decay_fn +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('Swin') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +arguments = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, arguments) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +#config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + # + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: 
{train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + dist.all_reduce(loss) + dist.all_reduce(acc1) + dist.all_reduce(acc5) + loss = loss / dist.get_world_size() + acc1 = acc1 / dist.get_world_size() + acc5 = acc5 / dist.get_world_size() + + batch_size = paddle.to_tensor(image.shape[0]) + dist.all_reduce(batch_size) + + val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) + val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) + val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0]) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main_worker(*args): + # 0. Preparation + dist.init_parallel_env() + last_epoch = config.TRAIN.LAST_EPOCH + world_size = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + seed = config.SEED + local_rank + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # 1. Create model + model = build_model(config) + model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train, dataset_val = args[0], args[1] + dataloader_train = get_dataloader(config, dataset_train, 'train', True) + dataloader_val = get_dataloader(config, dataset_val, 'test', True) + total_batch_train = len(dataloader_train) + total_batch_val = len(dataloader_val) + logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. 
Define optimizer and lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip, + apply_decay_param_fun=get_exclude_from_weight_decay_fn([ + 'absolute_pos_embed', 'relative_position_bias_table']), + ) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # 5. Load pretrained model / load resumt model and optimizer states + if config.MODEL.PRETRAINED: + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + + # 6. 
Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + + # 6. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER) + scheduler.step() + + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if local_rank == 0: + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +def main(): + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS + dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + + +if __name__ == "__main__": + main() diff --git a/image_classification/SwinTransformer/main_single_gpu.py b/image_classification/SwinTransformer/main_single_gpu.py new file mode 100644 index 00000000..5f9d9373 --- /dev/null +++ b/image_classification/SwinTransformer/main_single_gpu.py @@ -0,0 +1,337 @@ + +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Swin training/validation using single GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from datasets import get_dataloader +from datasets import get_dataset +from swin_transformer import build_swin as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from utils import get_exclude_from_weight_decay_fn +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('Swin') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +args = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, args) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +#config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " 
+ + f"Avg Acc: {train_acc_meter.avg:.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + batch_size = image.shape[0] + val_loss_meter.update(loss.numpy()[0], batch_size) + val_acc1_meter.update(acc1.numpy()[0], batch_size) + val_acc5_meter.update(acc5.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main(): + # 0. Preparation + last_epoch = config.TRAIN.LAST_EPOCH + seed = config.SEED + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + #paddle.set_device('gpu:0') + # 1. Create model + model = build_model(config) + # 2. Create train and val dataloader + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + dataloader_train = get_dataloader(config, dataset_train, 'train', False) + dataloader_val = get_dataloader(config, dataset_val, 'val', False) + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. Define lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + # 5. 
Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip, + apply_decay_param_fun=get_exclude_from_weight_decay_fn([ + 'absolute_pos_embed', 'relative_position_bias_table']), + ) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + # 6. Load pretrained model or load resume model and optimizer states + if config.MODEL.PRETRAINED: + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") + # 7. Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + # 8. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. 
LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=len(dataloader_train), + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + ) + scheduler.step() + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +if __name__ == "__main__": + main() diff --git a/image_classification/SwinTransformer/port_weights/load_pytorch_weights.py b/image_classification/SwinTransformer/port_weights/load_pytorch_weights.py new file mode 100644 index 00000000..9fec0b3a --- /dev/null +++ b/image_classification/SwinTransformer/port_weights/load_pytorch_weights.py @@ -0,0 +1,185 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +import timm +from swin_transformer import * +from config import * + + +config = get_config('./configs/swin_base_patch4_window7_224.yaml') +print(config) + + +def print_model_named_params(model): + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + print('----------------------------------') + + +def print_model_named_buffers(model): + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + print('----------------------------------') + + +def torch_to_paddle_mapping(): + mapping = [ + ('patch_embed.proj', 'patch_embedding.patch_embed'), + ('patch_embed.norm', 'patch_embedding.norm'), + ] + + # torch 'layers' to paddle 'stages' + depths = config.MODEL.TRANS.STAGE_DEPTHS + num_stages = len(depths) + for stage_idx in range(num_stages): + pp_s_prefix = f'stages.{stage_idx}.blocks' + th_s_prefix = f'layers.{stage_idx}.blocks' + for block_idx in range(depths[stage_idx]): + th_b_prefix = f'{th_s_prefix}.{block_idx}' + pp_b_prefix = f'{pp_s_prefix}.{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.norm1', f'{pp_b_prefix}.norm1'), + (f'{th_b_prefix}.attn.relative_position_bias_table', f'{pp_b_prefix}.attn.relative_position_bias_table'), + (f'{th_b_prefix}.attn.qkv', f'{pp_b_prefix}.attn.qkv'), + (f'{th_b_prefix}.attn.proj', f'{pp_b_prefix}.attn.proj'), + (f'{th_b_prefix}.norm2', f'{pp_b_prefix}.norm2'), + (f'{th_b_prefix}.mlp.fc1', f'{pp_b_prefix}.mlp.fc1'), + (f'{th_b_prefix}.mlp.fc2', f'{pp_b_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + # stage downsample: last stage does not have downsample ops + if stage_idx < num_stages - 1: + mapping.extend([ + (f'layers.{stage_idx}.downsample.reduction.weight', f'stages.{stage_idx}.downsample.reduction.weight'), + (f'layers.{stage_idx}.downsample.norm', f'stages.{stage_idx}.downsample.norm')]) + + mapping.extend([ + ('norm', 'norm'), + ('head', 'fc')]) + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if th_name.endswith('relative_position_bias_table'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + + + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_swin(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('+++++++++++++++++++++++++++++++++++') + device = torch.device('cpu') + torch_model = timm.create_model('swin_base_patch4_window7_224', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:20]) + print(out_paddle[0, 0:20]) + assert np.allclose(out_torch, out_paddle, atol = 1e-4) + + # save weights for paddle model + model_path = os.path.join('./swin_base_patch4_window7_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + + + #tmp = np.random.randn(1, 56, 128, 128).astype('float32') + #xp = paddle.to_tensor(tmp) + #xt = torch.Tensor(tmp).to(device) + #xps = paddle.roll(xp, shifts=(-3, -3), axis=(1,2)) + #xts = torch.roll(xt,shifts=(-3, -3), dims=(1,2)) + #xps = xps.cpu().numpy() + #xts = xts.data.cpu().numpy() + #assert np.allclose(xps, xts, atol=1e-4) + +if __name__ == "__main__": + main() diff --git a/image_classification/SwinTransformer/port_weights/load_pytorch_weights_384.py b/image_classification/SwinTransformer/port_weights/load_pytorch_weights_384.py new file mode 100644 index 00000000..c088c7d9 --- /dev/null +++ b/image_classification/SwinTransformer/port_weights/load_pytorch_weights_384.py @@ -0,0 +1,183 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +import timm +from swin_transformer import * +from config import * + + +config = get_config('./configs/swin_base_patch4_window12_384.yaml') +print(config) + + +def print_model_named_params(model): + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + print('----------------------------------') + + +def print_model_named_buffers(model): + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + print('----------------------------------') + + +def torch_to_paddle_mapping(): + mapping = [ + ('patch_embed.proj', 'patch_embedding.patch_embed'), + ('patch_embed.norm', 'patch_embedding.norm'), + ] + + # torch 'layers' to paddle 'stages' + depths = config.MODEL.TRANS.STAGE_DEPTHS + num_stages = len(depths) + for stage_idx in range(num_stages): + pp_s_prefix = f'stages.{stage_idx}.blocks' + th_s_prefix = f'layers.{stage_idx}.blocks' + for block_idx in range(depths[stage_idx]): + th_b_prefix = f'{th_s_prefix}.{block_idx}' + pp_b_prefix = f'{pp_s_prefix}.{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.norm1', f'{pp_b_prefix}.norm1'), + (f'{th_b_prefix}.attn.relative_position_bias_table', f'{pp_b_prefix}.attn.relative_position_bias_table'), + (f'{th_b_prefix}.attn.qkv', f'{pp_b_prefix}.attn.qkv'), + (f'{th_b_prefix}.attn.proj', f'{pp_b_prefix}.attn.proj'), + (f'{th_b_prefix}.norm2', f'{pp_b_prefix}.norm2'), + (f'{th_b_prefix}.mlp.fc1', f'{pp_b_prefix}.mlp.fc1'), + (f'{th_b_prefix}.mlp.fc2', f'{pp_b_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + # stage downsample: last stage does not have downsample ops + if stage_idx < num_stages - 1: + mapping.extend([ + (f'layers.{stage_idx}.downsample.reduction.weight', f'stages.{stage_idx}.downsample.reduction.weight'), + (f'layers.{stage_idx}.downsample.norm', f'stages.{stage_idx}.downsample.norm')]) + + mapping.extend([ + ('norm', 'norm'), + ('head', 'fc')]) + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if th_name.endswith('relative_position_bias_table'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_swin(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('+++++++++++++++++++++++++++++++++++') + device = torch.device('cpu') + torch_model = timm.create_model('swin_base_patch4_window12_384', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 384, 384).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:20]) + print(out_paddle[0, 0:20]) + assert np.allclose(out_torch, out_paddle, atol = 1e-4) + + # save weights for paddle model + model_path = os.path.join('./swin_base_patch4_window12_384.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + + + #tmp = np.random.randn(1, 56, 128, 128).astype('float32') + #xp = paddle.to_tensor(tmp) + #xt = torch.Tensor(tmp).to(device) + #xps = paddle.roll(xp, shifts=(-3, -3), axis=(1,2)) + #xts = torch.roll(xt,shifts=(-3, -3), dims=(1,2)) + #xps = xps.cpu().numpy() + #xts = xts.data.cpu().numpy() + #assert np.allclose(xps, xts, atol=1e-4) + +if __name__ == "__main__": + main() diff --git a/image_classification/SwinTransformer/port_weights/load_pytorch_weights_large_384.py b/image_classification/SwinTransformer/port_weights/load_pytorch_weights_large_384.py new file mode 100644 index 00000000..0e775ee9 --- /dev/null +++ b/image_classification/SwinTransformer/port_weights/load_pytorch_weights_large_384.py @@ -0,0 +1,185 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +import timm +from swin_transformer import * +from config import * + + +config = get_config('./configs/swin_large_patch4_window12_384.yaml') +print(config) + + +def print_model_named_params(model): + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + print('----------------------------------') + + +def print_model_named_buffers(model): + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + print('----------------------------------') + + +def torch_to_paddle_mapping(): + mapping = [ + ('patch_embed.proj', 'patch_embedding.patch_embed'), + ('patch_embed.norm', 'patch_embedding.norm'), + ] + + # torch 'layers' to paddle 'stages' + depths = config.MODEL.TRANS.STAGE_DEPTHS + num_stages = len(depths) + for stage_idx in range(num_stages): + pp_s_prefix = f'stages.{stage_idx}.blocks' + th_s_prefix = f'layers.{stage_idx}.blocks' + for block_idx in range(depths[stage_idx]): + th_b_prefix = f'{th_s_prefix}.{block_idx}' + pp_b_prefix = f'{pp_s_prefix}.{block_idx}' + layer_mapping = [ + (f'{th_b_prefix}.norm1', f'{pp_b_prefix}.norm1'), + (f'{th_b_prefix}.attn.relative_position_bias_table', f'{pp_b_prefix}.attn.relative_position_bias_table'), + (f'{th_b_prefix}.attn.qkv', f'{pp_b_prefix}.attn.qkv'), + (f'{th_b_prefix}.attn.proj', f'{pp_b_prefix}.attn.proj'), + (f'{th_b_prefix}.norm2', f'{pp_b_prefix}.norm2'), + (f'{th_b_prefix}.mlp.fc1', f'{pp_b_prefix}.mlp.fc1'), + (f'{th_b_prefix}.mlp.fc2', f'{pp_b_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + # stage downsample: last stage does not have downsample ops + if stage_idx < num_stages - 1: + mapping.extend([ + (f'layers.{stage_idx}.downsample.reduction.weight', f'stages.{stage_idx}.downsample.reduction.weight'), + (f'layers.{stage_idx}.downsample.norm', f'stages.{stage_idx}.downsample.norm')]) + + mapping.extend([ + ('norm', 'norm'), + ('head', 'fc')]) + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if th_name.endswith('relative_position_bias_table'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + + + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_swin(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('+++++++++++++++++++++++++++++++++++') + device = torch.device('cpu') + torch_model = timm.create_model('swin_large_patch4_window12_384', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 384, 384).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + print('|||||||||||||||||||||||||||||||||||||||||||||||||||') + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:20]) + print(out_paddle[0, 0:20]) + assert np.allclose(out_torch, out_paddle, atol = 1e-4) + + # save weights for paddle model + model_path = os.path.join('./swin_large_patch4_window12_384.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + + + #tmp = np.random.randn(1, 56, 128, 128).astype('float32') + #xp = paddle.to_tensor(tmp) + #xt = torch.Tensor(tmp).to(device) + #xps = paddle.roll(xp, shifts=(-3, -3), axis=(1,2)) + #xts = torch.roll(xt,shifts=(-3, -3), dims=(1,2)) + #xps = xps.cpu().numpy() + #xts = xts.data.cpu().numpy() + #assert np.allclose(xps, xts, atol=1e-4) + +if __name__ == "__main__": + main() diff --git a/image_classification/SwinTransformer/run_eval.sh b/image_classification/SwinTransformer/run_eval.sh new file mode 100644 index 00000000..d5c304cc --- /dev/null +++ b/image_classification/SwinTransformer/run_eval.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ +-cfg='./configs/swin_base_patch4_window7_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./swin_base_patch4_window7_224' \ diff --git a/image_classification/SwinTransformer/run_eval_large_multi_384.sh b/image_classification/SwinTransformer/run_eval_large_multi_384.sh new file mode 100644 index 00000000..5b49efc2 --- /dev/null +++ b/image_classification/SwinTransformer/run_eval_large_multi_384.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/swin_large_patch4_window12_384.yaml' \ +-dataset='imagenet2012' \ +-batch_size=4 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./swin_large_patch4_window12_384' \ diff --git a/image_classification/SwinTransformer/run_eval_multi.sh b/image_classification/SwinTransformer/run_eval_multi.sh new file mode 100644 
index 00000000..e0547a65 --- /dev/null +++ b/image_classification/SwinTransformer/run_eval_multi.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/swin_base_patch4_window7_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=16 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./swin_base_patch4_window7_224' \ diff --git a/image_classification/SwinTransformer/run_eval_multi_384.sh b/image_classification/SwinTransformer/run_eval_multi_384.sh new file mode 100644 index 00000000..8cd53f45 --- /dev/null +++ b/image_classification/SwinTransformer/run_eval_multi_384.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/swin_base_patch4_window12_384.yaml' \ +-dataset='imagenet2012' \ +-batch_size=8 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./swin_base_patch4_window12_384' \ diff --git a/image_classification/SwinTransformer/run_train.sh b/image_classification/SwinTransformer/run_train.sh new file mode 100644 index 00000000..016141c3 --- /dev/null +++ b/image_classification/SwinTransformer/run_train.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ +-cfg='./configs/swin_base_patch4_window7_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=4 \ +-data_path='/dataset/imagenet' \ diff --git a/image_classification/SwinTransformer/run_train_multi.sh b/image_classification/SwinTransformer/run_train_multi.sh new file mode 100644 index 00000000..ef47eed2 --- /dev/null +++ b/image_classification/SwinTransformer/run_train_multi.sh @@ -0,0 +1,7 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/swin_base_patch4_window7_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=16 \ +-data_path='/dataset/imagenet' \ +-ngpus=8 diff --git a/image_classification/SwinTransformer/swin.png b/image_classification/SwinTransformer/swin.png new file mode 100644 index 00000000..0a45ee47 Binary files /dev/null and b/image_classification/SwinTransformer/swin.png differ diff --git a/image_classification/SwinTransformer/swin_transformer.py b/image_classification/SwinTransformer/swin_transformer.py new file mode 100644 index 00000000..554d2c0f --- /dev/null +++ b/image_classification/SwinTransformer/swin_transformer.py @@ -0,0 +1,609 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Implement Transformer Class for Swin Transformer +""" + +import paddle +import paddle.nn as nn +from droppath import DropPath + + +class Identity(nn.Layer): + """ Identity layer + + The output of this layer is the input without any change. + Use this layer to avoid if condition in some forward methods + + """ + def __init__(self): + super(Identity, self).__init__() + def forward(self, x): + return x + + +class PatchEmbedding(nn.Layer): + """Patch Embeddings + + Apply patch embeddings on input images. Embeddings is implemented using a Conv2D op. 
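+        e.g., with the default 224x224 input, patch_size=4 and embed_dim=96, the forward output has shape [batch, 56*56, 96].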
+ + Attributes: + image_size: int, input image size, default: 224 + patch_size: int, size of patch, default: 4 + in_channels: int, input image channels, default: 3 + embed_dim: int, embedding dimension, default: 96 + """ + + def __init__(self, image_size=224, patch_size=4, in_channels=3, embed_dim=96): + super().__init__() + image_size = (image_size, image_size) # TODO: add to_2tuple + patch_size = (patch_size, patch_size) + patches_resolution = [image_size[0]//patch_size[0], image_size[1]//patch_size[1]] + self.image_size = image_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + self.in_channels = in_channels + self.embed_dim = embed_dim + self.patch_embed = nn.Conv2D(in_channels=in_channels, + out_channels=embed_dim, + kernel_size=patch_size, + stride=patch_size) + self.norm = nn.LayerNorm(embed_dim) + + def forward(self, x): + x = self.patch_embed(x) # [batch, embed_dim, h, w] h,w = patch_resolution + x = x.flatten(start_axis=2, stop_axis=-1) # [batch, embed_dim, h*w] h*w = num_patches + x = x.transpose([0, 2, 1]) # [batch, h*w, embed_dim] + x = self.norm(x) # [batch, num_patches, embed_dim] + return x + + +class PatchMerging(nn.Layer): + """ Patch Merging class + + Merge multiple patch into one path and keep the out dim. + Spefically, merge adjacent 2x2 patches(dim=C) into 1 patch. + The concat dim 4*C is rescaled to 2*C + + Attributes: + input_resolution: tuple of ints, the size of input + dim: dimension of single patch + reduction: nn.Linear which maps 4C to 2C dim + norm: nn.LayerNorm, applied after linear layer. + """ + + def __init__(self, input_resolution, dim): + super(PatchMerging, self).__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4*dim, 2*dim, bias_attr=False) + self.norm = nn.LayerNorm(4*dim) + + def forward(self, x): + h, w = self.input_resolution + b, _, c = x.shape + x = x.reshape([b, h, w, c]) + + x0 = x[:, 0::2, 0::2, :] # [B, H/2, W/2, C] + x1 = x[:, 1::2, 0::2, :] # [B, H/2, W/2, C] + x2 = x[:, 0::2, 1::2, :] # [B, H/2, W/2, C] + x3 = x[:, 1::2, 1::2, :] # [B, H/2, W/2, C] + x = paddle.concat([x0, x1, x2, x3], -1) #[B, H/2, W/2, 4*C] + x = x.reshape([b, -1, 4*c]) # [B, H/2*W/2, 4*C] + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class Mlp(nn.Layer): + """ MLP module + + Impl using nn.Linear and activation is GELU, dropout is applied. 
+ Ops: fc -> act -> dropout -> fc -> dropout + + Attributes: + fc1: nn.Linear + fc2: nn.Linear + act: GELU + dropout1: dropout after fc1 + dropout2: dropout after fc2 + """ + + def __init__(self, in_features, hidden_features, dropout): + super(Mlp, self).__init__() + w_attr_1, b_attr_1 = self._init_weights() + self.fc1 = nn.Linear(in_features, + hidden_features, + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + w_attr_2, b_attr_2 = self._init_weights() + self.fc2 = nn.Linear(hidden_features, + in_features, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + self.act = nn.GELU() + self.dropout = nn.Dropout(dropout) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierUniform()) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Normal(std=1e-6)) + return weight_attr, bias_attr + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.dropout(x) + x = self.fc2(x) + x = self.dropout(x) + return x + + +class WindowAttention(nn.Layer): + """Window based multihead attention, with relative position bias. + + Both shifted window and non-shifted window are supported. + + Attributes: + dim: int, input dimension (channels) + window_size: int, height and width of the window + num_heads: int, number of attention heads + qkv_bias: bool, if True, enable learnable bias to q,k,v, default: True + qk_scale: float, override default qk scale head_dim**-0.5 if set, default: None + attention_dropout: float, dropout of attention + dropout: float, dropout for output + """ + + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attention_dropout=0., + dropout=0.): + super(WindowAttention, self).__init__() + self.window_size = window_size + self.num_heads = num_heads + self.dim = dim + self.dim_head = dim // num_heads + self.scale = qk_scale or self.dim_head ** -0.5 + + self.relative_position_bias_table = paddle.create_parameter( + shape=[(2 * window_size[0] -1) * (2 * window_size[1] - 1), num_heads], + dtype='float32', + default_initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + + # relative position index for each token inside window + coords_h = paddle.arange(self.window_size[0]) + coords_w = paddle.arange(self.window_size[1]) + coords = paddle.stack(paddle.meshgrid([coords_h, coords_w])) # [2, window_h, window_w] + coords_flatten = paddle.flatten(coords, 1) # [2, window_h * window_w] + # 2, window_h * window_w, window_h * window_h + relative_coords = coords_flatten.unsqueeze(2) - coords_flatten.unsqueeze(1) + # winwod_h*window_w, window_h*window_w, 2 + relative_coords = relative_coords.transpose([1, 2, 0]) + relative_coords[:, :, 0] += self.window_size[0] - 1 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2* self.window_size[1] - 1 + # [window_size * window_size, window_size*window_size] + relative_position_index = relative_coords.sum(-1) + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_dropout = nn.Dropout(attention_dropout) + self.proj = nn.Linear(dim, dim) + self.proj_dropout = nn.Dropout(dropout) + self.softmax = nn.Softmax(axis=-1) + + def transpose_multihead(self, x): + new_shape = x.shape[:-1] + [self.num_heads, self.dim_head] + x = x.reshape(new_shape) + x = x.transpose([0, 2, 1, 3]) + return x + + def get_relative_pos_bias_from_pos_index(self): + # relative_position_bias_table is a ParamBase object + # 
https://github.com/PaddlePaddle/Paddle/blob/067f558c59b34dd6d8626aad73e9943cf7f5960f/python/paddle/fluid/framework.py#L5727 + table = self.relative_position_bias_table # N x num_heads + # index is a tensor + index = self.relative_position_index.reshape([-1]) # window_h*window_w * window_h*window_w + # NOTE: paddle does NOT support indexing Tensor by a Tensor + relative_position_bias = paddle.index_select(x=table, index=index) + return relative_position_bias + + def forward(self, x, mask=None): + qkv = self.qkv(x).chunk(3, axis=-1) + q, k, v = map(self.transpose_multihead, qkv) + q = q * self.scale + attn = paddle.matmul(q, k, transpose_y=True) + + relative_position_bias = self.get_relative_pos_bias_from_pos_index() + + relative_position_bias = relative_position_bias.reshape( + [self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1]) + + # nH, window_h*window_w, window_h*window_w + relative_position_bias = relative_position_bias.transpose([2, 0, 1]) + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.reshape( + [x.shape[0] // nW, nW, self.num_heads, x.shape[1], x.shape[1]]) + attn += mask.unsqueeze(1).unsqueeze(0) + attn = attn.reshape([-1, self.num_heads, x.shape[1], x.shape[1]]) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_dropout(attn) + + z = paddle.matmul(attn, v) + z = z.transpose([0, 2, 1, 3]) + new_shape = z.shape[:-2] + [self.dim] + z = z.reshape(new_shape) + z = self.proj(z) + z = self.proj_dropout(z) + + return z + + +def windows_partition(x, window_size): + """ partite windows into window_size x window_size + Args: + x: Tensor, shape=[b, h, w, c] + window_size: int, window size + Returns: + x: Tensor, shape=[num_windows*b, window_size, window_size, c] + """ + + B, H, W, C = x.shape + x = x.reshape([B, H//window_size, window_size, W//window_size, window_size, C]) + x = x.transpose([0, 1, 3, 2, 4, 5]) + x = x.reshape([-1, window_size, window_size, C]) #(num_windows*B, window_size, window_size, C) + + return x + + +def windows_reverse(windows, window_size, H, W): + """ Window reverse + Args: + windows: (n_windows * B, window_size, window_size, C) + window_size: (int) window size + H: (int) height of image + W: (int) width of image + + Returns: + x: (B, H, W, C) + """ + + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.reshape([B, H // window_size, W // window_size, window_size, window_size, -1]) + x = x.transpose([0, 1, 3, 2, 4, 5]) + x = x.reshape([B, H, W, -1]) + return x + + +class SwinTransformerBlock(nn.Layer): + """Swin transformer block + + Contains window multi head self attention, droppath, mlp, norm and residual. + + Attributes: + dim: int, input dimension (channels) + input_resolution: int, input resoultion + num_heads: int, number of attention heads + windos_size: int, window size, default: 7 + shift_size: int, shift size for SW-MSA, default: 0 + mlp_ratio: float, ratio of mlp hidden dim and input embedding dim, default: 4. + qkv_bias: bool, if True, enable learnable bias to q,k,v, default: True + qk_scale: float, override default qk scale head_dim**-0.5 if set, default: None + dropout: float, dropout for output, default: 0. + attention_dropout: float, dropout of attention, default: 0. + droppath: float, drop path rate, default: 0. 
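+        Note: blocks inside a stage are created with shift_size alternating between 0 (W-MSA) and window_size // 2 (SW-MSA), see SwinTransformerStage.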
+ """ + + def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0, + mlp_ratio=4., qkv_bias=True, qk_scale=None, dropout=0., + attention_dropout=0., droppath=0.): + super(SwinTransformerBlock, self).__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + if min(self.input_resolution) <= self.window_size: + self.shift_size = 0 + self.window_size = min(self.input_resolution) + + self.norm1 = nn.LayerNorm(dim) + self.attn = WindowAttention(dim, + window_size=(self.window_size, self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attention_dropout=attention_dropout, + dropout=dropout) + self.drop_path = DropPath(droppath) if droppath > 0. else None + self.norm2 = nn.LayerNorm(dim) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim*mlp_ratio), + dropout=dropout) + + if self.shift_size > 0: + H, W = self.input_resolution + img_mask = paddle.zeros((1, H, W, 1)) + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = windows_partition(img_mask, self.window_size) + mask_windows = mask_windows.reshape((-1, self.window_size * self.window_size)) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = paddle.where(attn_mask != 0, + paddle.ones_like(attn_mask) * float(-100.0), + attn_mask) + attn_mask = paddle.where(attn_mask == 0, + paddle.zeros_like(attn_mask), + attn_mask) + else: + attn_mask = None + + self.register_buffer("attn_mask", attn_mask) + + def forward(self, x): + H, W = self.input_resolution + B, L, C = x.shape + h = x + x = self.norm1(x) + + new_shape = [B, H, W, C] + x = x.reshape(new_shape) + + if self.shift_size > 0: + shifted_x = paddle.roll(x, + shifts=(-self.shift_size, -self.shift_size), + axis=(1, 2)) + else: + shifted_x = x + + x_windows = windows_partition(shifted_x, self.window_size) + x_windows = x_windows.reshape([-1, self.window_size * self.window_size, C]) + + attn_windows = self.attn(x_windows, mask=self.attn_mask) + attn_windows = attn_windows.reshape([-1, self.window_size, self.window_size, C]) + + shifted_x = windows_reverse(attn_windows, self.window_size, H, W) + + # reverse cyclic shift + if self.shift_size > 0: + x = paddle.roll(shifted_x, + shifts=(self.shift_size, self.shift_size), + axis=(1, 2)) + else: + x = shifted_x + + x = x.reshape([B, H*W, C]) + + if self.drop_path is not None: + x = h + self.drop_path(x) + else: + x = h + x + h = x + x = self.norm2(x) + x = self.mlp(x) + if self.drop_path is not None: + x = h + self.drop_path(x) + else: + x = h + x + + return x + + +class SwinTransformerStage(nn.Layer): + """Stage layers for swin transformer + + Stage layers contains a number of Transformer blocks and an optional + patch merging layer, patch merging is not applied after last stage + + Attributes: + dim: int, embedding dimension + input_resolution: tuple, input resoliution + depth: list, num of blocks in each stage + blocks: nn.LayerList, contains SwinTransformerBlocks for one stage + downsample: PatchMerging, patch merging layer, none if last stage + """ + def __init__(self, dim, input_resolution, depth, num_heads, window_size, + 
mlp_ratio=4., qkv_bias=True, qk_scale=None, dropout=0., + attention_dropout=0., droppath=0., downsample=None): + super(SwinTransformerStage, self).__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + + self.blocks = nn.LayerList() + for i in range(depth): + self.blocks.append( + SwinTransformerBlock( + dim=dim, input_resolution=input_resolution, + num_heads=num_heads, window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + dropout=dropout, attention_dropout=attention_dropout, + droppath=droppath[i] if isinstance(droppath, list) else droppath)) + + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim) + else: + self.downsample = None + + def forward(self, x): + for block in self.blocks: + x = block(x) + if self.downsample is not None: + x = self.downsample(x) + + return x + + +class SwinTransformer(nn.Layer): + """SwinTransformer class + + Attributes: + num_classes: int, num of image classes + num_stages: int, num of stages contains patch merging and Swin blocks + depths: list of int, num of Swin blocks in each stage + num_heads: int, num of heads in attention module + embed_dim: int, output dimension of patch embedding + num_features: int, output dimension of whole network before classifier + mlp_ratio: float, hidden dimension of mlp layer is mlp_ratio * mlp input dim + qkv_bias: bool, if True, set qkv layers have bias enabled + qk_scale: float, scale factor for qk. + ape: bool, if True, set to use absolute positional embeddings + window_size: int, size of patch window for inputs + dropout: float, dropout rate for linear layer + dropout_attn: float, dropout rate for attention + patch_embedding: PatchEmbedding, patch embedding instance + patch_resolution: tuple, number of patches in row and column + position_dropout: nn.Dropout, dropout op for position embedding + stages: SwinTransformerStage, stage instances. + norm: nn.LayerNorm, norm layer applied after transformer + avgpool: nn.AveragePool2D, pooling layer before classifer + fc: nn.Linear, classifier op. 
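+        Note: the channel dim doubles after each PatchMerging, so num_features = embed_dim * 2 ** (num_stages - 1), e.g. 96 -> 768 for the default 4-stage config.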
+ """ + def __init__(self, + image_size=224, + patch_size=4, + in_channels=3, + num_classes=1000, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + dropout=0., + attention_dropout=0., + droppath=0., + ape=False): + super(SwinTransformer, self).__init__() + + self.num_classes = num_classes + self.num_stages = len(depths) + self.embed_dim = embed_dim + self.num_features = int(self.embed_dim * 2 ** (self.num_stages - 1)) + self.mlp_ratio = mlp_ratio + self.ape = ape + + self.patch_embedding = PatchEmbedding(image_size=image_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim) + num_patches = self.patch_embedding.num_patches + self.patches_resolution = self.patch_embedding.patches_resolution + + + if self.ape: + self.absolute_positional_embedding = paddle.nn.ParameterList([ + paddle.create_parameter( + shape=[1, num_patches, self.embed_dim], dtype='float32', + default_initializer=paddle.nn.initializer.TruncatedNormal(std=.02))]) + + self.position_dropout = nn.Dropout(dropout) + + depth_decay = [x.item() for x in paddle.linspace(0, droppath, sum(depths))] + + self.stages = nn.LayerList() + for stage_idx in range(self.num_stages): + stage = SwinTransformerStage( + dim=int(self.embed_dim * 2 ** stage_idx), + input_resolution=( + self.patches_resolution[0] // (2 ** stage_idx), + self.patches_resolution[1] // (2 ** stage_idx)), + depth=depths[stage_idx], + num_heads=num_heads[stage_idx], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + dropout=dropout, + attention_dropout=attention_dropout, + droppath=depth_decay[ + sum(depths[:stage_idx]):sum(depths[:stage_idx+1])], + downsample=PatchMerging if ( + stage_idx < self.num_stages-1) else None, + ) + self.stages.append(stage) + + self.norm = nn.LayerNorm(self.num_features) + self.avgpool = nn.AdaptiveAvgPool1D(1) + self.fc = nn.Linear(self.num_features, self.num_classes) + + def forward_features(self, x): + x = self.patch_embedding(x) + if self.ape: + x = x + self.absolute_positional_embedding + x = self.position_dropout(x) + + for stage in self.stages: + x = stage(x) + + x = self.norm(x) + x = x.transpose([0, 2, 1]) + x = self.avgpool(x) + x = x.flatten(1) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.fc(x) + return x + + +def build_swin(config): + model = SwinTransformer( + image_size=config.DATA.IMAGE_SIZE, + patch_size=config.MODEL.TRANS.PATCH_SIZE, + in_channels=config.MODEL.TRANS.IN_CHANNELS, + embed_dim=config.MODEL.TRANS.EMBED_DIM, + num_classes=config.MODEL.NUM_CLASSES, + depths=config.MODEL.TRANS.STAGE_DEPTHS, + num_heads=config.MODEL.TRANS.NUM_HEADS, + mlp_ratio=config.MODEL.TRANS.MLP_RATIO, + qkv_bias=config.MODEL.TRANS.QKV_BIAS, + qk_scale=config.MODEL.TRANS.QK_SCALE, + ape=config.MODEL.TRANS.APE, + window_size=config.MODEL.TRANS.WINDOW_SIZE, + dropout=config.MODEL.DROPOUT, + attention_dropout=config.MODEL.ATTENTION_DROPOUT, + droppath=config.MODEL.DROP_PATH) + return model diff --git a/image_classification/SwinTransformer/utils.py b/image_classification/SwinTransformer/utils.py new file mode 100644 index 00000000..44800527 --- /dev/null +++ b/image_classification/SwinTransformer/utils.py @@ -0,0 +1,120 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""utils for ViT + +Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training +and WarmupCosineScheduler for training + +""" + +import math +from paddle.optimizer.lr import LRScheduler + + +class AverageMeter(): + """ Meter for monitoring losses""" + def __init__(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.reset() + + def reset(self): + """reset all values to zeros""" + self.avg = 0 + self.sum = 0 + self.cnt = 0 + + def update(self, val, n=1): + """update avg by val and n, where val is the avg of n values""" + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + + +def get_exclude_from_weight_decay_fn(exclude_list=[]): + """ Set params with no weight decay during the training + + For certain params, e.g., positional encoding in ViT, weight decay + may not needed during the learning, this method is used to find + these params. + + Args: + exclude_list: a list of params names which need to exclude + from weight decay. + Returns: + exclude_from_weight_decay_fn: a function returns True if param + will be excluded from weight decay + """ + if len(exclude_list) == 0: + exclude_from_weight_decay_fn = None + else: + def exclude_fn(param): + for name in exclude_list: + if param.endswith(name): + return False + return True + exclude_from_weight_decay_fn = exclude_fn + return exclude_from_weight_decay_fn + + +class WarmupCosineScheduler(LRScheduler): + """Warmup Cosine Scheduler + + First apply linear warmup, then apply cosine decay schedule. + Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" + Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining + "total_epochs - warmup_epochs" + + Attributes: + learning_rate: the starting learning rate (without warmup), not used here! + warmup_start_lr: warmup starting learning rate + start_lr: the starting learning rate (without warmup) + end_lr: the ending learning rate after whole loop + warmup_epochs: # of epochs for warmup + total_epochs: # of total epochs (include warmup) + """ + def __init__(self, + learning_rate, + warmup_start_lr, + start_lr, + end_lr, + warmup_epochs, + total_epochs, + cycles=0.5, + last_epoch=-1, + verbose=False): + """init WarmupCosineScheduler """ + self.warmup_epochs = warmup_epochs + self.total_epochs = total_epochs + self.warmup_start_lr = warmup_start_lr + self.start_lr = start_lr + self.end_lr = end_lr + self.cycles = cycles + super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + """ return lr value """ + if self.last_epoch < self.warmup_epochs: + val = (self.start_lr - self.warmup_start_lr) * float( + self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr + return val + + progress = float(self.last_epoch - self.warmup_epochs) / float( + max(1, self.total_epochs - self.warmup_epochs)) + val = max(0.0, 0.5 * (1. 
+ math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) + return val diff --git a/image_classification/T2T_ViT/README.md b/image_classification/T2T_ViT/README.md new file mode 100644 index 00000000..b05e326f --- /dev/null +++ b/image_classification/T2T_ViT/README.md @@ -0,0 +1,171 @@ +# Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet, [arxiv](https://arxiv.org/abs/2101.11986) + +PaddlePaddle training/validation code and pretrained models for **T2T-ViT**. + +The official pytorch implementation is [here](https://github.com/yitu-opensource/T2T-ViT). + +This implementation is developed by [PaddleViT](https://github.com/BR-IDL/PaddleViT.git). + + +

+<h4 align="center">T2T-ViT Model Overview</h4>
+ + +### Update +Update (2021-08-18): Code is released and ported weights are uploaded. + +## Models Zoo +| Model | Acc@1 | Acc@5 | Image Size | Crop_pct | Interpolation | Link | +|--------------------------------|-------|-------|------------|----------|---------------|--------------| +| t2t_vit_7 | 71.68 | 90.89 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1YkuPs1ku7B_udydOf_ls1LQvpJDg_c_j/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1jVNsz37gatLCDaOoU3NaMA)(1hpa) | +| t2t_vit_10 | 75.15 | 92.80 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1H--55RxliMDlOCekn7FpKrHDGsUkyrJZ/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1nbdb4PFMq4nsIp8HrNxLQg)(ixug) | +| t2t_vit_12 | 76.48 | 93.49 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1stnIwOwaescaEcztaF1QjI4NK4jaqN7P/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1DcMzq9WeSwrS3epv6jKJXw)(qpbb) | +| t2t_vit_14 | 81.50 | 95.67 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1HSvN3Csgsy7SJbxJYbkzjUx9guftkfZ1/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1wcfh22uopBv7pS7rKcH_iw)(c2u8) | +| t2t_vit_19 | 81.93 | 95.74 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1eFnhaL6I33pHCQw2BaEE0Oet9CnjmUf_/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1Hpyc5hBYo1zqoXWpryegnw)(4in3) | +| t2t_vit_24 | 82.28 | 95.89 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1Z7nZCHeFp0AhIkGYcMAFkKdkGN0yXtpv/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1Hpyc5hBYo1zqoXWpryegnw)(4in3) | +| t2t_vit_t_14 | 81.69 | 95.85 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/16li4voStt_B8eWDXqJt7s20OT_Z8L263/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1Hpyc5hBYo1zqoXWpryegnw)(4in3) | +| t2t_vit_t_19 | 82.44 | 96.08 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1Ty-42SYOu15Nk8Uo6VRTJ7J0JV_6t7zJ/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1YdQd6l8tj5xMCWvcHWm7sg)(mier) | +| t2t_vit_t_24 | 82.55 | 96.07 | 224 | 0.9 | bicubic | [google](https://drive.google.com/file/d/1cvvXrGr2buB8Np2WlVL7n_F1_CnI1qow/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1BMU3KX_TRmPxQ1jN5cmWhg)(6vxc) | +| t2t_vit_14_384 | 83.34 | 96.50 | 384 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1Yuso8WD7Q8Lu_9I8dTvAvkcXXtPSkmnm/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1AOMhyVRF9zPqJe-lTrd7pw)(r685) | + +> *The results are evaluated on ImageNet2012 validation set. +## Notebooks +We provide a few notebooks in aistudio to help you get started: + +**\*(coming soon)\*** + + +## Requirements +- Python>=3.6 +- yaml>=0.2.5 +- [PaddlePaddle](https://www.paddlepaddle.org.cn/documentation/docs/en/install/index_en.html)>=2.1.0 +- [yacs](https://github.com/rbgirshick/yacs)>=0.1.8 + +## Data +ImageNet2012 dataset is used in the following folder structure: +``` +│imagenet/ +├──train/ +│ ├── n01440764 +│ │ ├── n01440764_10026.JPEG +│ │ ├── n01440764_10027.JPEG +│ │ ├── ...... +│ ├── ...... +├──val/ +│ ├── n01440764 +│ │ ├── ILSVRC2012_val_00000293.JPEG +│ │ ├── ILSVRC2012_val_00002138.JPEG +│ │ ├── ...... +│ ├── ...... +``` + +## Usage +To use the model with pretrained weights, download the `.pdparam` weight file and change related file paths in the following python scripts. The model config files are located in `./configs/`. 
+ +For example, assume the downloaded weight file is stored in `./t2t_vit_7.pdparams`, to use the `t2t_vit_7` model in python: +```python +import paddle +from config import get_config +from t2t_vit import build_t2t_vit as build_model +# config files in ./configs/ +config = get_config('./configs/t2t_vit_7.yaml') +# build model +model = build_model(config) +# load pretrained weights, .pdparams is NOT needed +model_state_dict = paddle.load('./t2t_vit_7') +model.set_dict(model_state_dict) +``` + +## Evaluation +To evaluate T2T-ViT model performance on ImageNet2012 with a single GPU, run the following script using command line: +```shell +sh run_eval.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/t2t_vit_7.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./t2t_vit_7' +``` + +
+ + +Run evaluation using multi-GPUs: + + + +```shell +sh run_eval_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/t2t_vit_7.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./t2t_vit_7' +``` + +
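+
+The loaded model can also be used directly for single-image inference. The sketch below assumes a local image `test.jpg` and mirrors the eval preprocessing in `datasets.py` (224 input, crop_pct 0.9); the weight file path is an assumption:
+
+```python
+import paddle
+from PIL import Image
+from paddle.vision import transforms
+from config import get_config
+from t2t_vit import build_t2t_vit as build_model
+
+config = get_config('./configs/t2t_vit_7.yaml')
+model = build_model(config)
+model.set_dict(paddle.load('./t2t_vit_7.pdparams'))  # assumed weight file path
+model.eval()
+
+# same eval transforms as get_val_transforms() in datasets.py
+trans = transforms.Compose([
+    transforms.Resize(int(224 / 0.9), 'bicubic'),
+    transforms.CenterCrop((224, 224)),
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+])
+x = paddle.unsqueeze(trans(Image.open('test.jpg').convert('RGB')), axis=0)
+with paddle.no_grad():
+    logits = model(x)
+print(paddle.argmax(logits, axis=-1).numpy())  # predicted ImageNet class id
+```
+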
+ + +## Training +To train the T2T-ViT Transformer model on ImageNet2012 with single GPU, run the following script using command line: +```shell +sh run_train.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/t2t_vit_7.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ +``` + +
+ + +Run training using multi-GPUs: + + + +```shell +sh run_train_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +python main_multi_gpu.py \ + -cfg='./configs/t2t_vit_7.yaml' \ + -dataset='imagenet2012' \ + -batch_size=32 \ + -data_path='/dataset/imagenet' \ +``` + +
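+
+Training can be resumed from a saved checkpoint. This is a sketch only: it assumes `main_single_gpu.py` forwards the `-resume` and `-last_epoch` options to `update_config` in `config.py` (which handles them in this PR), and the checkpoint path below is a placeholder:
+
+```shell
+CUDA_VISIBLE_DEVICES=0 \
+python main_single_gpu.py \
+    -cfg='./configs/t2t_vit_7.yaml' \
+    -dataset='imagenet2012' \
+    -batch_size=16 \
+    -data_path='/dataset/imagenet' \
+    -resume='./output/your_checkpoint' \
+    -last_epoch=100
+```
+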
+ + +## Visualization Attention Map +**(coming soon)** + +## Reference +``` +@article{yuan2021tokens, + title={Tokens-to-token vit: Training vision transformers from scratch on imagenet}, + author={Yuan, Li and Chen, Yunpeng and Wang, Tao and Yu, Weihao and Shi, Yujun and Jiang, Zihang and Tay, Francis EH and Feng, Jiashi and Yan, Shuicheng}, + journal={arXiv preprint arXiv:2101.11986}, + year={2021} +} +``` diff --git a/image_classification/T2T_ViT/config.py b/image_classification/T2T_ViT/config.py new file mode 100644 index 00000000..3cfecca4 --- /dev/null +++ b/image_classification/T2T_ViT/config.py @@ -0,0 +1,147 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration + +Configuration for data, model archtecture, and training, etc. +Config can be set by .yaml file or by argparser(limited usage) + + +""" +import os +from yacs.config import CfgNode as CN +import yaml + +_C = CN() +_C.BASE = [''] + +# data settings +_C.DATA = CN() +_C.DATA.BATCH_SIZE = 256 #256 # train batch_size for single GPU +_C.DATA.BATCH_SIZE_EVAL = 8 #64 # val batch_size for single GPU +_C.DATA.DATA_PATH = '/dataset/imagenet/' # path to dataset +_C.DATA.DATASET = 'imagenet2012' # dataset name +_C.DATA.IMAGE_SIZE = 224 # input image size: 224 for pretrain, 384 for finetune +_C.DATA.CROP_PCT = 0.875 # input image scale ratio, scale is applied before centercrop in eval mode +_C.DATA.NUM_WORKERS = 2 # number of data loading threads + +# model settings +_C.MODEL = CN() +_C.MODEL.TYPE = 'T2T-ViT' +_C.MODEL.NAME = 'T2T-ViT' +_C.MODEL.RESUME = None +_C.MODEL.PRETRAINED = None +_C.MODEL.NUM_CLASSES = 1000 +_C.MODEL.DROPOUT = 0.1 +_C.MODEL.ATTENTION_DROPOUT = 0.0 + +# transformer settings +_C.MODEL.TRANS = CN() +_C.MODEL.TRANS.TOKEN_TYPE = 'performer' # now supports performer and transformer +_C.MODEL.TRANS.EMBED_DIM = 768 +_C.MODEL.TRANS.MLP_RATIO = 4. 
+_C.MODEL.TRANS.NUM_HEADS = 12 +_C.MODEL.TRANS.DEPTH = 12 +_C.MODEL.TRANS.QKV_BIAS = True +_C.MODEL.TRANS.QK_SCALE = None + +# training settings +_C.TRAIN = CN() +_C.TRAIN.LAST_EPOCH = 0 +_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 3 #34 # ~ 10k steps for 4096 batch size +_C.TRAIN.WEIGHT_DECAY = 0.05 #0.3 # 0.0 for finetune +_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune +_C.TRAIN.WARMUP_START_LR = 1e-6 #0.0 +_C.TRAIN.END_LR = 5e-4 +_C.TRAIN.GRAD_CLIP = 1.0 +_C.TRAIN.ACCUM_ITER = 2 #1 + +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' +_C.TRAIN.LR_SCHEDULER.MILESTONES = "30, 60, 90" # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 # only used in StepLRScheduler + +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'AdamW' +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for adamW +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# misc +_C.SAVE = "./output" +_C.TAG = "default" +_C.SAVE_FREQ = 10 # freq to save chpt +_C.REPORT_FREQ = 100 # freq to logging info +_C.VALIDATE_FREQ = 50 # freq to do validation +_C.SEED = 0 +_C.EVAL = False # run evaluation only +_C.LOCAL_RANK = 0 +_C.NGPUS = -1 + + +def _update_config_from_file(config, cfg_file): + config.defrost() + with open(cfg_file, 'r') as infile: + yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader) + for cfg in yaml_cfg.setdefault('BASE', ['']): + if cfg: + _update_config_from_file( + config, os.path.join(os.path.dirname(cfg_file), cfg) + ) + print('merging config from {}'.format(cfg_file)) + config.merge_from_file(cfg_file) + config.freeze() + +def update_config(config, args): + """Update config by ArgumentParser + Args: + args: ArgumentParser contains options + Return: + config: updated config + """ + if args.cfg: + _update_config_from_file(config, args.cfg) + config.defrost() + if args.dataset: + config.DATA.DATASET = args.dataset + if args.batch_size: + config.DATA.BATCH_SIZE = args.batch_size + if args.image_size: + config.DATA.IMAGE_SIZE = args.image_size + if args.data_path: + config.DATA.DATA_PATH = args.data_path + if args.ngpus: + config.NGPUS = args.ngpus + if args.eval: + config.EVAL = True + config.DATA.BATCH_SIZE_EVAL = args.batch_size + if args.pretrained: + config.MODEL.PRETRAINED = args.pretrained + if args.resume: + config.MODEL.RESUME = args.resume + if args.last_epoch: + config.TRAIN.LAST_EPOCH = args.last_epoch + + #config.freeze() + return config + + +def get_config(cfg_file=None): + """Return a clone of config or load from yaml file""" + config = _C.clone() + if cfg_file: + _update_config_from_file(config, cfg_file) + return config diff --git a/image_classification/T2T_ViT/configs/t2t_vit_10.yaml b/image_classification/T2T_ViT/configs/t2t_vit_10.yaml new file mode 100644 index 00000000..dc151b56 --- /dev/null +++ b/image_classification/T2T_ViT/configs/t2t_vit_10.yaml @@ -0,0 +1,23 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.9 +MODEL: + TYPE: T2T-ViT + NAME: t2t-vit-10 + TRANS: + EMBED_DIM: 256 + DEPTH: 10 + NUM_HEADS: 4 + MLP_RATIO: 2.0 + QKV_BIAS: False + QK_SCALE: None #256 ** -0.5 +TRAIN: + NUM_EPOCHS: 300 + WARMUP_EPOCHS: 3 + WEIGHT_DECAY: 0.3 + BASE_LR: 0.003 + WARMUP_START_LR: 1e-6 + END_LR: 5e-4 + ACCUM_ITER: 2 + + diff --git a/image_classification/T2T_ViT/configs/t2t_vit_12.yaml b/image_classification/T2T_ViT/configs/t2t_vit_12.yaml new file mode 100644 index 00000000..5605d872 --- /dev/null +++ b/image_classification/T2T_ViT/configs/t2t_vit_12.yaml 
@@ -0,0 +1,23 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.9 +MODEL: + TYPE: T2T-ViT + NAME: t2t-vit-12 + TRANS: + EMBED_DIM: 256 + DEPTH: 12 + NUM_HEADS: 4 + MLP_RATIO: 2.0 + QKV_BIAS: False + QK_SCALE: None #256 ** -0.5 +TRAIN: + NUM_EPOCHS: 300 + WARMUP_EPOCHS: 3 + WEIGHT_DECAY: 0.3 + BASE_LR: 0.003 + WARMUP_START_LR: 1e-6 + END_LR: 5e-4 + ACCUM_ITER: 2 + + diff --git a/image_classification/T2T_ViT/configs/t2t_vit_14.yaml b/image_classification/T2T_ViT/configs/t2t_vit_14.yaml new file mode 100644 index 00000000..8b68be30 --- /dev/null +++ b/image_classification/T2T_ViT/configs/t2t_vit_14.yaml @@ -0,0 +1,23 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.9 +MODEL: + TYPE: T2T-ViT + NAME: t2t-vit-14 + TRANS: + EMBED_DIM: 384 + DEPTH: 14 + NUM_HEADS: 6 + MLP_RATIO: 3.0 + QKV_BIAS: False + QK_SCALE: None #0.05103103630798288 #384 ** -0.5 +TRAIN: + NUM_EPOCHS: 300 + WARMUP_EPOCHS: 3 + WEIGHT_DECAY: 0.3 + BASE_LR: 0.003 + WARMUP_START_LR: 1e-6 + END_LR: 5e-4 + ACCUM_ITER: 2 + + diff --git a/image_classification/T2T_ViT/configs/t2t_vit_14_384.yaml b/image_classification/T2T_ViT/configs/t2t_vit_14_384.yaml new file mode 100644 index 00000000..df83aff5 --- /dev/null +++ b/image_classification/T2T_ViT/configs/t2t_vit_14_384.yaml @@ -0,0 +1,23 @@ +DATA: + IMAGE_SIZE: 384 + CROP_PCT: 1.0 +MODEL: + TYPE: T2T-ViT + NAME: t2t-vit-14-384 + TRANS: + EMBED_DIM: 384 + DEPTH: 14 + NUM_HEADS: 6 + MLP_RATIO: 3.0 + QKV_BIAS: False + QK_SCALE: None +TRAIN: + NUM_EPOCHS: 300 + WARMUP_EPOCHS: 3 + WEIGHT_DECAY: 0.3 + BASE_LR: 0.003 + WARMUP_START_LR: 1e-6 + END_LR: 5e-4 + ACCUM_ITER: 2 + + diff --git a/image_classification/T2T_ViT/configs/t2t_vit_19.yaml b/image_classification/T2T_ViT/configs/t2t_vit_19.yaml new file mode 100644 index 00000000..9ace38e3 --- /dev/null +++ b/image_classification/T2T_ViT/configs/t2t_vit_19.yaml @@ -0,0 +1,23 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.9 +MODEL: + TYPE: T2T-ViT + NAME: t2t-vit-19 + TRANS: + EMBED_DIM: 448 + DEPTH: 19 + NUM_HEADS: 7 + MLP_RATIO: 3.0 + QKV_BIAS: False + QK_SCALE: None +TRAIN: + NUM_EPOCHS: 300 + WARMUP_EPOCHS: 3 + WEIGHT_DECAY: 0.3 + BASE_LR: 0.003 + WARMUP_START_LR: 1e-6 + END_LR: 5e-4 + ACCUM_ITER: 2 + + diff --git a/image_classification/T2T_ViT/configs/t2t_vit_24.yaml b/image_classification/T2T_ViT/configs/t2t_vit_24.yaml new file mode 100644 index 00000000..bf3f7949 --- /dev/null +++ b/image_classification/T2T_ViT/configs/t2t_vit_24.yaml @@ -0,0 +1,23 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.9 +MODEL: + TYPE: T2T-ViT + NAME: t2t-vit-24 + TRANS: + EMBED_DIM: 512 + DEPTH: 24 + NUM_HEADS: 8 + MLP_RATIO: 3.0 + QKV_BIAS: False + QK_SCALE: None +TRAIN: + NUM_EPOCHS: 300 + WARMUP_EPOCHS: 3 + WEIGHT_DECAY: 0.3 + BASE_LR: 0.003 + WARMUP_START_LR: 1e-6 + END_LR: 5e-4 + ACCUM_ITER: 2 + + diff --git a/image_classification/T2T_ViT/configs/t2t_vit_7.yaml b/image_classification/T2T_ViT/configs/t2t_vit_7.yaml new file mode 100644 index 00000000..0ff59bef --- /dev/null +++ b/image_classification/T2T_ViT/configs/t2t_vit_7.yaml @@ -0,0 +1,23 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.9 +MODEL: + TYPE: T2T-ViT + NAME: t2t-vit-7 + TRANS: + EMBED_DIM: 256 + DEPTH: 7 + NUM_HEADS: 4 + MLP_RATIO: 2.0 + QKV_BIAS: False + QK_SCALE: None #256 ** -0.5 +TRAIN: + NUM_EPOCHS: 300 + WARMUP_EPOCHS: 3 + WEIGHT_DECAY: 0.3 + BASE_LR: 0.003 + WARMUP_START_LR: 1e-6 + END_LR: 5e-4 + ACCUM_ITER: 2 + + diff --git a/image_classification/T2T_ViT/configs/t2t_vit_t_14.yaml b/image_classification/T2T_ViT/configs/t2t_vit_t_14.yaml new file mode 100644 index 00000000..c2a400b4 --- /dev/null +++ 
b/image_classification/T2T_ViT/configs/t2t_vit_t_14.yaml @@ -0,0 +1,24 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.9 +MODEL: + TYPE: T2T-ViT + NAME: t2t-vit-t-14 + TRANS: + TOKEN_TYPE: transformer + EMBED_DIM: 384 + DEPTH: 14 + NUM_HEADS: 6 + MLP_RATIO: 3.0 + QKV_BIAS: False + QK_SCALE: None #0.05103103630798288 #384 ** -0.5 +TRAIN: + NUM_EPOCHS: 300 + WARMUP_EPOCHS: 3 + WEIGHT_DECAY: 0.3 + BASE_LR: 0.003 + WARMUP_START_LR: 1e-6 + END_LR: 5e-4 + ACCUM_ITER: 2 + + diff --git a/image_classification/T2T_ViT/configs/t2t_vit_t_19.yaml b/image_classification/T2T_ViT/configs/t2t_vit_t_19.yaml new file mode 100644 index 00000000..1ef65948 --- /dev/null +++ b/image_classification/T2T_ViT/configs/t2t_vit_t_19.yaml @@ -0,0 +1,24 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.9 +MODEL: + TYPE: T2T-ViT + NAME: t2t-vit-t-19 + TRANS: + TOKEN_TYPE: transformer + EMBED_DIM: 448 + DEPTH: 19 + NUM_HEADS: 7 + MLP_RATIO: 3.0 + QKV_BIAS: False + QK_SCALE: None +TRAIN: + NUM_EPOCHS: 300 + WARMUP_EPOCHS: 3 + WEIGHT_DECAY: 0.3 + BASE_LR: 0.003 + WARMUP_START_LR: 1e-6 + END_LR: 5e-4 + ACCUM_ITER: 2 + + diff --git a/image_classification/T2T_ViT/configs/t2t_vit_t_24.yaml b/image_classification/T2T_ViT/configs/t2t_vit_t_24.yaml new file mode 100644 index 00000000..b194addf --- /dev/null +++ b/image_classification/T2T_ViT/configs/t2t_vit_t_24.yaml @@ -0,0 +1,24 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.9 +MODEL: + TYPE: T2T-ViT + NAME: t2t-vit-t-24 + TRANS: + TOKEN_TYPE: transformer + EMBED_DIM: 512 + DEPTH: 24 + NUM_HEADS: 8 + MLP_RATIO: 3.0 + QKV_BIAS: False + QK_SCALE: None +TRAIN: + NUM_EPOCHS: 300 + WARMUP_EPOCHS: 3 + WEIGHT_DECAY: 0.3 + BASE_LR: 0.003 + WARMUP_START_LR: 1e-6 + END_LR: 5e-4 + ACCUM_ITER: 2 + + diff --git a/image_classification/T2T_ViT/datasets.py b/image_classification/T2T_ViT/datasets.py new file mode 100644 index 00000000..78a3db09 --- /dev/null +++ b/image_classification/T2T_ViT/datasets.py @@ -0,0 +1,188 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Dataset related classes and methods for ViT training and validation +Cifar10, Cifar100 and ImageNet2012 are supported +""" + +import os +import math +from paddle.io import Dataset, DataLoader, DistributedBatchSampler +from paddle.vision import transforms, datasets, image_load + +class ImageNet2012Dataset(Dataset): + """Build ImageNet2012 dataset + + This class gets train/val imagenet datasets, which loads transfomed data and labels. 
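+        Each of train_list.txt / val_list.txt under file_folder is expected to hold one sample per line: an image path (relative to file_folder) and an integer label separated by whitespace.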
+ + Attributes: + file_folder: path where imagenet images are stored + transform: preprocessing ops to apply on image + img_path_list: list of full path of images in whole dataset + label_list: list of labels of whole dataset + """ + + def __init__(self, file_folder, mode="train", transform=None): + """Init ImageNet2012 Dataset with dataset file path, mode(train/val), and transform""" + super(ImageNet2012Dataset, self).__init__() + assert mode in ["train", "val"] + self.file_folder = file_folder + self.transform = transform + self.img_path_list = [] + self.label_list = [] + + if mode == "train": + self.list_file = os.path.join(self.file_folder, "train_list.txt") + else: + self.list_file = os.path.join(self.file_folder, "val_list.txt") + + with open(self.list_file, 'r') as infile: + for line in infile: + img_path = line.strip().split()[0] + img_label = int(line.strip().split()[1]) + self.img_path_list.append(os.path.join(self.file_folder, img_path)) + self.label_list.append(img_label) + print(f'----- Imagenet2012 image {mode} list len = {len(self.label_list)}') + + def __len__(self): + return len(self.label_list) + + def __getitem__(self, index): + data = image_load(self.img_path_list[index]).convert('RGB') + data = self.transform(data) + label = self.label_list[index] + + return data, label + + +def get_train_transforms(config): + """ Get training transforms + + For training, a RandomResizedCrop is applied, then normalization is applied with + [0.5, 0.5, 0.5] mean and std. The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + transforms_train = transforms.Compose([ + transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), + scale=(0.05, 1.0)), + transforms.ToTensor(), + #transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_train + + +def get_val_transforms(config): + """ Get training transforms + + For validation, image is first Resize then CenterCrop to image_size. + Then normalization is applied with [0.5, 0.5, 0.5] mean and std. + The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) + transforms_val = transforms.Compose([ + transforms.Resize(scale_size, 'bicubic'), + transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), + transforms.ToTensor(), + #transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_val + + +def get_dataset(config, mode='train'): + """ Get dataset from config and mode (train/val) + + Returns the related dataset object according to configs and mode(train/val) + + Args: + config: configs contains dataset related settings. 
see config.py for details + Returns: + dataset: dataset object + """ + + assert mode in ['train', 'val'] + if config.DATA.DATASET == "cifar10": + if mode == 'train': + dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "cifar100": + if mode == 'train': + dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "imagenet2012": + if mode == 'train': + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_train_transforms(config)) + else: + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_val_transforms(config)) + else: + raise NotImplementedError( + "[{config.DATA.DATASET}] Only cifar10, cifar100, imagenet2012 are supported now") + return dataset + + +def get_dataloader(config, dataset, mode='train', multi_process=False): + """Get dataloader with config, dataset, mode as input, allows multiGPU settings. + + Multi-GPU loader is implements as distributedBatchSampler. + + Args: + config: see config.py for details + dataset: paddle.io.dataset object + mode: train/val + multi_process: if True, use DistributedBatchSampler to support multi-processing + Returns: + dataloader: paddle.io.DataLoader object. + """ + + if mode == 'train': + batch_size = config.DATA.BATCH_SIZE + else: + batch_size = config.DATA.BATCH_SIZE_EVAL + + if multi_process is True: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=(mode == 'train')) + dataloader = DataLoader(dataset, + batch_sampler=sampler, + num_workers=config.DATA.NUM_WORKERS) + else: + dataloader = DataLoader(dataset, + batch_size=batch_size, + num_workers=config.DATA.NUM_WORKERS, + shuffle=(mode == 'train')) + return dataloader diff --git a/image_classification/T2T_ViT/droppath.py b/image_classification/T2T_ViT/droppath.py new file mode 100644 index 00000000..25b8d5ff --- /dev/null +++ b/image_classification/T2T_ViT/droppath.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth +""" + +import paddle +import paddle.nn as nn + + +class DropPath(nn.Layer): + """DropPath class""" + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def drop_path(self, inputs): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if self.drop_prob == 0. 
or not self.training: + return inputs + keep_prob = 1 - self.drop_prob + keep_prob = paddle.to_tensor(keep_prob, dtype='float32') + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide(keep_prob) * random_tensor #divide is to keep same output expectation + return output + + def forward(self, inputs): + return self.drop_path(inputs) + + +#def main(): +# tmp = paddle.to_tensor(np.random.rand(8, 16, 8, 8), dtype='float32') +# dp = DropPath(0.5) +# out = dp(tmp) +# print(out) +# +#if __name__ == "__main__": +# main() diff --git a/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_10.py b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_10.py new file mode 100644 index 00000000..dbe3d7a4 --- /dev/null +++ b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_10.py @@ -0,0 +1,200 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import numpy as np +import paddle +import torch +from T2T_ViT_torch.models.t2t_vit import * +from T2T_ViT_torch.utils import load_for_transfer_learning +from t2t_vit import build_t2t_vit as build_model +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/t2t_vit_10.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + +def torch_to_paddle_mapping(): + mapping = [ + ('cls_token', 'cls_token'), + ('pos_embed', 'pos_embed'), + ] + + for idx in range(1, 3): + th_prefix = f'tokens_to_token.attention{idx}' + pp_prefix = f'patch_embed.attn{idx}' + layer_mapping = [ + (f'{th_prefix}.w', f'{pp_prefix}.w'), + (f'{th_prefix}.kqv', f'{pp_prefix}.kqv'), + (f'{th_prefix}.proj', f'{pp_prefix}.proj'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.0', f'{pp_prefix}.mlp.0'), + (f'{th_prefix}.mlp.2', f'{pp_prefix}.mlp.2'), + ] + mapping.extend(layer_mapping) + mapping.append(('tokens_to_token.project','patch_embed.proj')) + + + num_layers = 
config.MODEL.TRANS.DEPTH + for idx in range(num_layers): + th_prefix = f'blocks.{idx}' + pp_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params: # nn.Parameters + if th_name.endswith('w'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params: + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_model(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model_path = './T2T_ViT_torch/t2t-vit-pth-models/75.2_T2T_ViT_10.pth.tar' + torch_model = t2t_vit_10() + load_for_transfer_learning(torch_model, + torch_model_path, + use_ema=True, + strict=False, + num_classes=1000) + + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + + #return + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./t2t_vit_10.pdparams') + 
paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_12.py b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_12.py new file mode 100644 index 00000000..5dfdf6d3 --- /dev/null +++ b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_12.py @@ -0,0 +1,200 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import numpy as np +import paddle +import torch +from T2T_ViT_torch.models.t2t_vit import * +from T2T_ViT_torch.utils import load_for_transfer_learning +from t2t_vit import build_t2t_vit as build_model +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/t2t_vit_12.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + +def torch_to_paddle_mapping(): + mapping = [ + ('cls_token', 'cls_token'), + ('pos_embed', 'pos_embed'), + ] + + for idx in range(1, 3): + th_prefix = f'tokens_to_token.attention{idx}' + pp_prefix = f'patch_embed.attn{idx}' + layer_mapping = [ + (f'{th_prefix}.w', f'{pp_prefix}.w'), + (f'{th_prefix}.kqv', f'{pp_prefix}.kqv'), + (f'{th_prefix}.proj', f'{pp_prefix}.proj'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.0', f'{pp_prefix}.mlp.0'), + (f'{th_prefix}.mlp.2', f'{pp_prefix}.mlp.2'), + ] + mapping.extend(layer_mapping) + mapping.append(('tokens_to_token.project','patch_embed.proj')) + + + num_layers = config.MODEL.TRANS.DEPTH + for idx in range(num_layers): + th_prefix = f'blocks.{idx}' + pp_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + + +def 
convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params: # nn.Parameters + if th_name.endswith('w'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params: + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_model(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model_path = './T2T_ViT_torch/t2t-vit-pth-models/76.5_T2T_ViT_12.pth.tar' + torch_model = t2t_vit_12() + load_for_transfer_learning(torch_model, + torch_model_path, + use_ema=True, + strict=False, + num_classes=1000) + + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + + #return + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./t2t_vit_12.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_14.py b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_14.py new file mode 100644 index 00000000..67e2cce5 --- /dev/null +++ b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_14.py @@ -0,0 +1,200 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import numpy as np +import paddle +import torch +from T2T_ViT_torch.models.t2t_vit import * +from T2T_ViT_torch.utils import load_for_transfer_learning +from t2t_vit import build_t2t_vit as build_model +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/t2t_vit_14.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + +def torch_to_paddle_mapping(): + mapping = [ + ('cls_token', 'cls_token'), + ('pos_embed', 'pos_embed'), + ] + + for idx in range(1, 3): + th_prefix = f'tokens_to_token.attention{idx}' + pp_prefix = f'patch_embed.attn{idx}' + layer_mapping = [ + (f'{th_prefix}.w', f'{pp_prefix}.w'), + (f'{th_prefix}.kqv', f'{pp_prefix}.kqv'), + (f'{th_prefix}.proj', f'{pp_prefix}.proj'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.0', f'{pp_prefix}.mlp.0'), + (f'{th_prefix}.mlp.2', f'{pp_prefix}.mlp.2'), + ] + mapping.extend(layer_mapping) + mapping.append(('tokens_to_token.project','patch_embed.proj')) + + + num_layers = config.MODEL.TRANS.DEPTH + for idx in range(num_layers): + th_prefix = f'blocks.{idx}' + pp_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. 
get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params: # nn.Parameters + if th_name.endswith('w'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params: + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_model(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model_path = './T2T_ViT_torch/t2t-vit-pth-models/81.5_T2T_ViT_14.pth.tar' + torch_model = t2t_vit_14() + load_for_transfer_learning(torch_model, + torch_model_path, + use_ema=True, + strict=False, + num_classes=1000) + + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + + #return + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./t2t_vit_14.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_14_384.py b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_14_384.py new file mode 100644 index 00000000..882ac4a6 --- /dev/null +++ b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_14_384.py @@ -0,0 +1,200 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
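+
+# Overview: the script below builds a torch-to-paddle parameter-name mapping,
+# copies each tensor (transposing 2-D weights where needed), compares the two
+# models with np.allclose on a random input, and saves the converted weights
+# as a .pdparams file.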
+ +import argparse +import numpy as np +import paddle +import torch +from T2T_ViT_torch.models.t2t_vit import * +from T2T_ViT_torch.utils import load_for_transfer_learning +from t2t_vit import build_t2t_vit as build_model +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/t2t_vit_14_384.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + +def torch_to_paddle_mapping(): + mapping = [ + ('cls_token', 'cls_token'), + ('pos_embed', 'pos_embed'), + ] + + for idx in range(1, 3): + th_prefix = f'tokens_to_token.attention{idx}' + pp_prefix = f'patch_embed.attn{idx}' + layer_mapping = [ + (f'{th_prefix}.w', f'{pp_prefix}.w'), + (f'{th_prefix}.kqv', f'{pp_prefix}.kqv'), + (f'{th_prefix}.proj', f'{pp_prefix}.proj'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.0', f'{pp_prefix}.mlp.0'), + (f'{th_prefix}.mlp.2', f'{pp_prefix}.mlp.2'), + ] + mapping.extend(layer_mapping) + mapping.append(('tokens_to_token.project','patch_embed.proj')) + + + num_layers = config.MODEL.TRANS.DEPTH + for idx in range(num_layers): + th_prefix = f'blocks.{idx}' + pp_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params: # nn.Parameters + if th_name.endswith('w'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params: + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_model(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model_path = './T2T_ViT_torch/t2t-vit-pth-models/83.3_T2T_ViT_14.pth.tar' + torch_model = t2t_vit_14(img_size=384) + load_for_transfer_learning(torch_model, + torch_model_path, + use_ema=True, + strict=False, + num_classes=1000) + + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + + #return + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 384, 384).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./t2t_vit_14_384.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_19.py b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_19.py new file mode 100644 index 00000000..3d1fd7f7 --- /dev/null +++ b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_19.py @@ -0,0 +1,200 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +from T2T_ViT_torch.models.t2t_vit import * +from T2T_ViT_torch.utils import load_for_transfer_learning +from t2t_vit import build_t2t_vit as build_model +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/t2t_vit_19.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + +def torch_to_paddle_mapping(): + mapping = [ + ('cls_token', 'cls_token'), + ('pos_embed', 'pos_embed'), + ] + + for idx in range(1, 3): + th_prefix = f'tokens_to_token.attention{idx}' + pp_prefix = f'patch_embed.attn{idx}' + layer_mapping = [ + (f'{th_prefix}.w', f'{pp_prefix}.w'), + (f'{th_prefix}.kqv', f'{pp_prefix}.kqv'), + (f'{th_prefix}.proj', f'{pp_prefix}.proj'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.0', f'{pp_prefix}.mlp.0'), + (f'{th_prefix}.mlp.2', f'{pp_prefix}.mlp.2'), + ] + mapping.extend(layer_mapping) + mapping.append(('tokens_to_token.project','patch_embed.proj')) + + + num_layers = config.MODEL.TRANS.DEPTH + for idx in range(num_layers): + th_prefix = f'blocks.{idx}' + pp_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params: # nn.Parameters + if th_name.endswith('w'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params: + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_model(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model_path = './T2T_ViT_torch/t2t-vit-pth-models/81.9_T2T_ViT_19.pth.tar' + torch_model = t2t_vit_19() + load_for_transfer_learning(torch_model, + torch_model_path, + use_ema=True, + strict=False, + num_classes=1000) + + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + + #return + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./t2t_vit_19.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_24.py b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_24.py new file mode 100644 index 00000000..5e56cbab --- /dev/null +++ b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_24.py @@ -0,0 +1,200 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +from T2T_ViT_torch.models.t2t_vit import * +from T2T_ViT_torch.utils import load_for_transfer_learning +from t2t_vit import build_t2t_vit as build_model +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/t2t_vit_24.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + +def torch_to_paddle_mapping(): + mapping = [ + ('cls_token', 'cls_token'), + ('pos_embed', 'pos_embed'), + ] + + for idx in range(1, 3): + th_prefix = f'tokens_to_token.attention{idx}' + pp_prefix = f'patch_embed.attn{idx}' + layer_mapping = [ + (f'{th_prefix}.w', f'{pp_prefix}.w'), + (f'{th_prefix}.kqv', f'{pp_prefix}.kqv'), + (f'{th_prefix}.proj', f'{pp_prefix}.proj'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.0', f'{pp_prefix}.mlp.0'), + (f'{th_prefix}.mlp.2', f'{pp_prefix}.mlp.2'), + ] + mapping.extend(layer_mapping) + mapping.append(('tokens_to_token.project','patch_embed.proj')) + + + num_layers = config.MODEL.TRANS.DEPTH + for idx in range(num_layers): + th_prefix = f'blocks.{idx}' + pp_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params: # nn.Parameters + if th_name.endswith('w'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params: + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_model(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model_path = './T2T_ViT_torch/t2t-vit-pth-models/82.3_T2T_ViT_24.pth.tar' + torch_model = t2t_vit_24() + load_for_transfer_learning(torch_model, + torch_model_path, + use_ema=True, + strict=False, + num_classes=1000) + + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + + #return + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./t2t_vit_24.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_7.py b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_7.py new file mode 100644 index 00000000..fafce8a7 --- /dev/null +++ b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_7.py @@ -0,0 +1,200 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
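+
+# Note: torch.nn.Linear stores its weight as (out_features, in_features) while
+# paddle.nn.Linear stores it as (in_features, out_features), so 2-D weight
+# matrices are transposed during conversion; parameters whose names end in 'w'
+# are copied without transposing (see _set_value below).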
+ +import argparse +import numpy as np +import paddle +import torch +from T2T_ViT_torch.models.t2t_vit import * +from T2T_ViT_torch.utils import load_for_transfer_learning +from t2t_vit import build_t2t_vit as build_model +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/t2t_vit_7.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + +def torch_to_paddle_mapping(): + mapping = [ + ('cls_token', 'cls_token'), + ('pos_embed', 'pos_embed'), + ] + + for idx in range(1, 3): + th_prefix = f'tokens_to_token.attention{idx}' + pp_prefix = f'patch_embed.attn{idx}' + layer_mapping = [ + (f'{th_prefix}.w', f'{pp_prefix}.w'), + (f'{th_prefix}.kqv', f'{pp_prefix}.kqv'), + (f'{th_prefix}.proj', f'{pp_prefix}.proj'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.0', f'{pp_prefix}.mlp.0'), + (f'{th_prefix}.mlp.2', f'{pp_prefix}.mlp.2'), + ] + mapping.extend(layer_mapping) + mapping.append(('tokens_to_token.project','patch_embed.proj')) + + + num_layers = config.MODEL.TRANS.DEPTH + for idx in range(num_layers): + th_prefix = f'blocks.{idx}' + pp_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params: # nn.Parameters + if th_name.endswith('w'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params: + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_model(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model_path = './T2T_ViT_torch/t2t-vit-pth-models/71.7_T2T_ViT_7.pth.tar' + torch_model = t2t_vit_7() + load_for_transfer_learning(torch_model, + torch_model_path, + use_ema=True, + strict=False, + num_classes=1000) + + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + + #return + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./t2t_vit_7.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_t_14.py b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_t_14.py new file mode 100644 index 00000000..583cf9dd --- /dev/null +++ b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_t_14.py @@ -0,0 +1,199 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +from T2T_ViT_torch.models.t2t_vit import * +from T2T_ViT_torch.utils import load_for_transfer_learning +from t2t_vit import build_t2t_vit as build_model +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/t2t_vit_t_14.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + +def torch_to_paddle_mapping(): + mapping = [ + ('cls_token', 'cls_token'), + ('pos_embed', 'pos_embed'), + ] + + for idx in range(1, 3): + th_prefix = f'tokens_to_token.attention{idx}' + pp_prefix = f'patch_embed.attn{idx}' + layer_mapping = [ + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + mapping.append(('tokens_to_token.project','patch_embed.proj')) + + + num_layers = config.MODEL.TRANS.DEPTH + for idx in range(num_layers): + th_prefix = f'blocks.{idx}' + pp_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params: # nn.Parameters + if th_name.endswith('w'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params: + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_model(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model_path = './T2T_ViT_torch/t2t-vit-pth-models/81.7_T2T_ViTt_14.pth.tar' + torch_model = t2t_vit_t_14() + load_for_transfer_learning(torch_model, + torch_model_path, + use_ema=True, + strict=False, + num_classes=1000) + + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + + #return + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./t2t_vit_t_14.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_t_19.py b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_t_19.py new file mode 100644 index 00000000..2d56aacd --- /dev/null +++ b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_t_19.py @@ -0,0 +1,199 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +from T2T_ViT_torch.models.t2t_vit import * +from T2T_ViT_torch.utils import load_for_transfer_learning +from t2t_vit import build_t2t_vit as build_model +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/t2t_vit_t_19.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + +def torch_to_paddle_mapping(): + mapping = [ + ('cls_token', 'cls_token'), + ('pos_embed', 'pos_embed'), + ] + + for idx in range(1, 3): + th_prefix = f'tokens_to_token.attention{idx}' + pp_prefix = f'patch_embed.attn{idx}' + layer_mapping = [ + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + mapping.append(('tokens_to_token.project','patch_embed.proj')) + + + num_layers = config.MODEL.TRANS.DEPTH + for idx in range(num_layers): + th_prefix = f'blocks.{idx}' + pp_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params: # nn.Parameters + if th_name.endswith('w'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params: + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_model(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model_path = './T2T_ViT_torch/t2t-vit-pth-models/82.4_T2T_ViTt_19.pth.tar' + torch_model = t2t_vit_t_19() + load_for_transfer_learning(torch_model, + torch_model_path, + use_ema=True, + strict=False, + num_classes=1000) + + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + + #return + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./t2t_vit_t_19.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_t_24.py b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_t_24.py new file mode 100644 index 00000000..ce2ec2a5 --- /dev/null +++ b/image_classification/T2T_ViT/load_pth_weights/load_pytorch_weights_t_24.py @@ -0,0 +1,199 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +from T2T_ViT_torch.models.t2t_vit import * +from T2T_ViT_torch.utils import load_for_transfer_learning +from t2t_vit import build_t2t_vit as build_model +from config import * + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/t2t_vit_t_24.yaml') +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + +def torch_to_paddle_mapping(): + mapping = [ + ('cls_token', 'cls_token'), + ('pos_embed', 'pos_embed'), + ] + + for idx in range(1, 3): + th_prefix = f'tokens_to_token.attention{idx}' + pp_prefix = f'patch_embed.attn{idx}' + layer_mapping = [ + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + mapping.append(('tokens_to_token.project','patch_embed.proj')) + + + num_layers = config.MODEL.TRANS.DEPTH + for idx in range(num_layers): + th_prefix = f'blocks.{idx}' + pp_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'norm'), + ('head', 'head'), + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, no_transpose=False): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + if not no_transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params: # nn.Parameters + if th_name.endswith('w'): + _set_value(th_name, pd_name, no_transpose=True) + else: + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params: + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_model(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('--------------') + print_model_named_buffers(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model_path = './T2T_ViT_torch/t2t-vit-pth-models/82.6_T2T_ViTt_24.pth.tar' + torch_model = t2t_vit_t_24() + load_for_transfer_learning(torch_model, + torch_model_path, + use_ema=True, + strict=False, + num_classes=1000) + + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print('--------------') + print_model_named_buffers(torch_model) + print('----------------------------------') + + + #return + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./t2t_vit_t_24.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/T2T_ViT/main_multi_gpu.py b/image_classification/T2T_ViT/main_multi_gpu.py new file mode 100644 index 00000000..616ee793 --- /dev/null +++ b/image_classification/T2T_ViT/main_multi_gpu.py @@ -0,0 +1,364 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""T2T-ViT Transformer training/validation using multiple GPU """ + +import sys +import os +import time +import logging +import copy +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as dist +from datasets import get_dataloader +from datasets import get_dataset +from t2t_vit import build_t2t_vit as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('T2T-ViT Transformer') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +arguments = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, arguments) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + # + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: 
{train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + dist.all_reduce(loss) + dist.all_reduce(acc1) + dist.all_reduce(acc5) + loss = loss / dist.get_world_size() + acc1 = acc1 / dist.get_world_size() + acc5 = acc5 / dist.get_world_size() + + batch_size = paddle.to_tensor(image.shape[0]) + dist.all_reduce(batch_size) + + val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) + val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) + val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0]) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main_worker(*args): + # 0. Preparation + dist.init_parallel_env() + last_epoch = config.TRAIN.LAST_EPOCH + world_size = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + seed = config.SEED + local_rank + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # 1. Create model + model = build_model(config) + model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train, dataset_val = args[0], args[1] + dataloader_train = get_dataloader(config, dataset_train, 'train', True) + dataloader_val = get_dataloader(config, dataset_val, 'test', True) + total_batch_train = len(dataloader_train) + total_batch_val = len(dataloader_val) + logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. 
Define optimizer and lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip, + #apply_decay_param_fun=get_exclude_from_weight_decay_fn(['pos_embed', 'cls_token']), + ) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # 5. Load pretrained model / load resumt model and optimizer states + if config.MODEL.PRETRAINED: + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + + # 6. 
Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + + # 6. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER) + scheduler.step() + + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if local_rank == 0: + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +def main(): + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS + dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + + +if __name__ == "__main__": + main() diff --git a/image_classification/T2T_ViT/main_single_gpu.py b/image_classification/T2T_ViT/main_single_gpu.py new file mode 100644 index 00000000..00ed8711 --- /dev/null +++ b/image_classification/T2T_ViT/main_single_gpu.py @@ -0,0 +1,333 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
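+# Note: config.MODEL.PRETRAINED and config.MODEL.RESUME are expected to be
+# checkpoint paths *without* the '.pdparams' / '.pdopt' suffix (for example
+# -pretrained='./t2t_vit_7'); the suffixes are appended by the loading code
+# below, mirroring main_multi_gpu.py.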
+ +"""T2T-ViT Transformer training/validation using single GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from datasets import get_dataloader +from datasets import get_dataset +from t2t_vit import build_t2t_vit as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('T2T-ViT Transformer') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +args = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, args) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +#config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: 
{train_acc_meter.avg:.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + batch_size = image.shape[0] + val_loss_meter.update(loss.numpy()[0], batch_size) + val_acc1_meter.update(acc1.numpy()[0], batch_size) + val_acc5_meter.update(acc5.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main(): + # 0. Preparation + last_epoch = config.TRAIN.LAST_EPOCH + seed = config.SEED + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + #paddle.set_device('gpu:0') + # 1. Create model + model = build_model(config) + #model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + dataloader_train = get_dataloader(config, dataset_train, 'train', False) + dataloader_val = get_dataloader(config, dataset_val, 'val', False) + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. Define lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + # 5. 
Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + # 6. Load pretrained model or load resume model and optimizer states + if config.MODEL.PRETRAINED: + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_dict(opt_state) + logger.info( + f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") + # 7. Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + # 8. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. 
LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=len(dataloader_train), + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + ) + scheduler.step() + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +if __name__ == "__main__": + main() diff --git a/image_classification/T2T_ViT/run_eval_7.sh b/image_classification/T2T_ViT/run_eval_7.sh new file mode 100644 index 00000000..c4259f9c --- /dev/null +++ b/image_classification/T2T_ViT/run_eval_7.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=7 \ +python main_single_gpu.py \ +-cfg='./configs/t2t_vit_7.yaml' \ +-dataset='imagenet2012' \ +-batch_size=256 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./t2t_vit_7' diff --git a/image_classification/T2T_ViT/run_eval_multi_10.sh b/image_classification/T2T_ViT/run_eval_multi_10.sh new file mode 100644 index 00000000..3146c8e8 --- /dev/null +++ b/image_classification/T2T_ViT/run_eval_multi_10.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/t2t_vit_10.yaml' \ +-dataset='imagenet2012' \ +-batch_size=128 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./t2t_vit_10' \ diff --git a/image_classification/T2T_ViT/run_eval_multi_12.sh b/image_classification/T2T_ViT/run_eval_multi_12.sh new file mode 100644 index 00000000..d27cd4ac --- /dev/null +++ b/image_classification/T2T_ViT/run_eval_multi_12.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/t2t_vit_12.yaml' \ +-dataset='imagenet2012' \ +-batch_size=128 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./t2t_vit_12' \ diff --git a/image_classification/T2T_ViT/run_eval_multi_14.sh b/image_classification/T2T_ViT/run_eval_multi_14.sh new file mode 100644 index 00000000..3a3c755a --- /dev/null +++ b/image_classification/T2T_ViT/run_eval_multi_14.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/t2t_vit_14.yaml' \ +-dataset='imagenet2012' \ +-batch_size=128 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./t2t_vit_14' \ diff --git a/image_classification/T2T_ViT/run_eval_multi_14_384.sh b/image_classification/T2T_ViT/run_eval_multi_14_384.sh new file mode 100644 index 00000000..d58281bb --- /dev/null +++ 
b/image_classification/T2T_ViT/run_eval_multi_14_384.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/t2t_vit_14_384.yaml' \ +-dataset='imagenet2012' \ +-batch_size=128 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./t2t_vit_14_384' \ diff --git a/image_classification/T2T_ViT/run_eval_multi_19.sh b/image_classification/T2T_ViT/run_eval_multi_19.sh new file mode 100644 index 00000000..2f729f9c --- /dev/null +++ b/image_classification/T2T_ViT/run_eval_multi_19.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/t2t_vit_19.yaml' \ +-dataset='imagenet2012' \ +-batch_size=128 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./t2t_vit_19' \ diff --git a/image_classification/T2T_ViT/run_eval_multi_24.sh b/image_classification/T2T_ViT/run_eval_multi_24.sh new file mode 100644 index 00000000..d904bc3e --- /dev/null +++ b/image_classification/T2T_ViT/run_eval_multi_24.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/t2t_vit_24.yaml' \ +-dataset='imagenet2012' \ +-batch_size=128 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./t2t_vit_24' \ diff --git a/image_classification/T2T_ViT/run_eval_multi_7.sh b/image_classification/T2T_ViT/run_eval_multi_7.sh new file mode 100644 index 00000000..85ea8f36 --- /dev/null +++ b/image_classification/T2T_ViT/run_eval_multi_7.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/t2t_vit_7.yaml' \ +-dataset='imagenet2012' \ +-batch_size=128 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./t2t_vit_7' \ diff --git a/image_classification/T2T_ViT/run_eval_multi_t_14.sh b/image_classification/T2T_ViT/run_eval_multi_t_14.sh new file mode 100644 index 00000000..ca05141d --- /dev/null +++ b/image_classification/T2T_ViT/run_eval_multi_t_14.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/t2t_vit_t_14.yaml' \ +-dataset='imagenet2012' \ +-batch_size=128 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./t2t_vit_t_14' \ diff --git a/image_classification/T2T_ViT/run_eval_multi_t_19.sh b/image_classification/T2T_ViT/run_eval_multi_t_19.sh new file mode 100644 index 00000000..9f8c5321 --- /dev/null +++ b/image_classification/T2T_ViT/run_eval_multi_t_19.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/t2t_vit_t_19.yaml' \ +-dataset='imagenet2012' \ +-batch_size=128 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./t2t_vit_t_19' \ diff --git a/image_classification/T2T_ViT/run_eval_multi_t_24.sh b/image_classification/T2T_ViT/run_eval_multi_t_24.sh new file mode 100644 index 00000000..f96402d9 --- /dev/null +++ b/image_classification/T2T_ViT/run_eval_multi_t_24.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/t2t_vit_t_24.yaml' \ +-dataset='imagenet2012' \ +-batch_size=128 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./t2t_vit_t_24' \ diff --git a/image_classification/T2T_ViT/run_train.sh b/image_classification/T2T_ViT/run_train.sh new file mode 100644 index 00000000..65ee7da8 --- /dev/null +++ b/image_classification/T2T_ViT/run_train.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=7 \ +python main_single_gpu.py \ +-cfg='./configs/t2t_vit_7.yaml' \ +-dataset='imagenet2012' \ +-batch_size=8 \ +-data_path='/dataset/imagenet' \ diff --git a/image_classification/T2T_ViT/run_train_multi.sh 
b/image_classification/T2T_ViT/run_train_multi.sh new file mode 100644 index 00000000..cbdcb75a --- /dev/null +++ b/image_classification/T2T_ViT/run_train_multi.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/t2t_vit_7.yaml' \ +-dataset='imagenet2012' \ +-batch_size=16 \ +-data_path='/dataset/imagenet' \ diff --git a/image_classification/T2T_ViT/t2t_vit.png b/image_classification/T2T_ViT/t2t_vit.png new file mode 100644 index 00000000..36bc6cf9 Binary files /dev/null and b/image_classification/T2T_ViT/t2t_vit.png differ diff --git a/image_classification/T2T_ViT/t2t_vit.py b/image_classification/T2T_ViT/t2t_vit.py new file mode 100644 index 00000000..030dfe8c --- /dev/null +++ b/image_classification/T2T_ViT/t2t_vit.py @@ -0,0 +1,571 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Implement T2T-ViT Transformer +""" + +import copy +import math +#from scipy.stats import ortho_group +import numpy as np +import paddle +import paddle.nn as nn +from droppath import DropPath + + +class Identity(nn.Layer): + """ Identity layer + + The output of this layer is the input without any change. + Use this layer to avoid using 'if' condition in forward methods + """ + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +class PatchEmbedding(nn.Layer): + """Patch Embeddings + + Apply patch embeddings (tokens-to-token) on input images. Embeddings is + implemented using one of the following ops: Performer, Transformer. + + Attributes: + image_size: int, input image size, default: 224 + token_type: string, type of token embedding, in ['performer', 'transformer', 'convolution'], default: 'performer' + patch_size: int, size of patch, default: 4 + in_channels: int, input image channels, default: 3 + embed_dim: int, embedding dimension, default: 96 + token_dim: int, intermediate dim for patch_embedding module, default: 64 + """ + def __init__(self, + image_size=224, + token_type='performer', + in_channels=3, + embed_dim=768, + token_dim=64): + super().__init__() + if token_type == 'transformer': + # paddle v 2.1 has bugs on nn.Unfold, + # use paddle.nn.functional.unfold method instead + # replacements see forward method. + #self.soft_split0 = nn.Unfold(kernel_size=7, strides=4, paddings=2) + #self.soft_split1 = nn.Unfold(kernel_size=3, strides=2, paddings=1) + #self.soft_split2 = nn.Unfold(kernel_size=3, strides=2, paddings=1) + + self.attn1 = TokenTransformer(dim=in_channels * 7 * 7, + in_dim=token_dim, + num_heads=1, + mlp_ratio=1.0) + self.attn2 = TokenTransformer(dim=token_dim * 3 * 3, + in_dim=token_dim, + num_heads=1, + mlp_ratio=1.0) + + self.proj = nn.Linear(token_dim * 3 * 3, embed_dim) + + elif token_type == 'performer': + # paddle v 2.1 has bugs on nn.Unfold, + # use paddle.nn.functional.unfold method instead + # replacements see forward method. 
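+            # Shape sketch for the three soft splits (assuming the default
+            # 224x224 input; the transformer branch follows the same numbers):
+            #   unfold(k=7, s=4, p=2): 224 -> 56, giving 56*56 = 3136 tokens of dim C*7*7
+            #   unfold(k=3, s=2, p=1):  56 -> 28, giving 28*28 = 784  tokens of dim token_dim*3*3
+            #   unfold(k=3, s=2, p=1):  28 -> 14, giving 14*14 = 196  tokens of dim token_dim*3*3
+            # hence num_patches = (image_size // (4*2*2))**2 = 196 below.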
+ #self.soft_split0 = nn.Unfold(kernel_sizes=7, strides=4, paddings=2) + #self.soft_split1 = nn.Unfold(kernel_sizes=3, strides=2, paddings=1) + #self.soft_split2 = nn.Unfold(kernel_sizes=3, strides=2, paddings=1) + + self.attn1 = TokenPerformer(dim=in_channels * 7 * 7, + in_dim=token_dim, + kernel_ratio=0.5) + self.attn2 = TokenPerformer(dim=token_dim * 3 * 3, + in_dim=token_dim, + kernel_ratio=0.5) + + self.proj = nn.Linear(token_dim * 3 * 3, embed_dim) + + elif token_type == 'convolution': # NOTE: currently not supported!!! + # 1st conv + self.soft_split0 = nn.Conv2D(in_channels=in_channels, + out_channels=token_dim, + kernel_size=7, + stride=4, + padding=2) + # 2nd conv + self.soft_split1 = nn.Conv2D(in_channels=token_dim, + out_channels=token_dim, + kernel_size=3, + stride=2, + padding=1) + # 3rd conv + self.proj = nn.Conv2D(in_channels=token_dim, + out_channels=embed_dim, + kernel_size=3, + stride=2, + padding=1) + else: + raise ValueError(f'token_type: {token_type} is not supported!') + + # 3 soft splits, each has stride 4, 2, 2, respectively. + self.num_patches = (image_size // (4 * 2 * 2)) * (image_size // (4 * 2 * 2)) + + def forward(self, x): + # x = self.soft_split0(x) + # input x: [B, C, IMAGE_H, IMAGE_W] + x = paddle.nn.functional.unfold(x, kernel_sizes=7, strides=4, paddings=2) + # unfolded x: [B, C * k * k, k * k * num_patches] + x = x.transpose([0, 2, 1]) + # transposed x: [B, k * k * num_patches, C * k * k] + + x = self.attn1(x) + B, HW, C = x.shape + x = x.transpose([0, 2, 1]) + x = x.reshape([B, C, int(np.sqrt(HW)), int(np.sqrt(HW))]) + #x = self.soft_split1(x) + x = paddle.nn.functional.unfold(x, kernel_sizes=3, strides=2, paddings=1) + x = x.transpose([0, 2, 1]) + + x = self.attn2(x) + B, HW, C = x.shape + x = x.transpose([0, 2, 1]) + x = x.reshape([B, C, int(np.sqrt(HW)), int(np.sqrt(HW))]) + #x = self.soft_split2(x) + x = paddle.nn.functional.unfold(x, kernel_sizes=3, strides=2, paddings=1) + x = x.transpose([0, 2, 1]) + + x = self.proj(x) + return x + + +class Mlp(nn.Layer): + """ MLP module + + Impl using nn.Linear and activation is GELU, dropout is applied. 
+ Ops: fc -> act -> dropout -> fc -> dropout + + Attributes: + fc1: nn.Linear + fc2: nn.Linear + act: GELU + dropout1: dropout after fc1 + dropout2: dropout after fc2 + """ + + def __init__(self, in_features, hidden_features=None, out_features=None, dropout=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + w_attr_1, b_attr_1 = self._init_weights() + self.fc1 = nn.Linear(in_features, + hidden_features, + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + w_attr_2, b_attr_2 = self._init_weights() + self.fc2 = nn.Linear(hidden_features, + out_features, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + self.act = nn.GELU() + self.dropout = nn.Dropout(dropout) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierUniform()) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Normal(std=1e-6)) + return weight_attr, bias_attr + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.dropout(x) + x = self.fc2(x) + x = self.dropout(x) + return x + + +class Attention(nn.Layer): + """ Self-Attention + + Args: + dim: int, all heads dimension + dim_head: int, single heads dimension, default: None + num_heads: int, num of heads + qkv_bias: bool, if True, qkv linear layer is using bias, default: False + qk_scale: float, if None, qk_scale is dim_head ** -0.5, default: None + attention_dropout: float, dropout rate for attention dropout, default: 0. + dropout: float, dropout rate for projection dropout, default: 0. + skip_connection: bool, if Ture, use v to do skip connection, used in TokenTransformer + """ + def __init__(self, + dim, + in_dim=None, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attention_dropout=0., + dropout=0., + skip_connection=False): + super().__init__() + self.num_heads = num_heads + self.in_dim = in_dim or dim + self.dim_head = dim // num_heads + self.scale = qk_scale or self.dim_head ** -0.5 + # same as original repo + self.qkv = nn.Linear(dim, self.in_dim * 3, bias_attr=qkv_bias) + + self.attn_dropout = nn.Dropout(attention_dropout) + self.proj = nn.Linear(self.in_dim, self.in_dim) + self.proj_dropout = nn.Dropout(dropout) + self.softmax = nn.Softmax(axis=-1) + + # use V to do skip connection, used in TokenTransformer + self.skip = skip_connection + + def transpose_multihead(self, x): + if self.skip: # token transformer + new_shape = x.shape[:-1] + [self.num_heads, self.in_dim] + else: # regular attention + new_shape = x.shape[:-1] + [self.num_heads, self.dim_head] + x = x.reshape(new_shape) + x = x.transpose([0, 2, 1, 3]) + return x + + def forward(self, x): + B, H, C = x.shape + qkv = self.qkv(x).chunk(3, axis=-1) + q, k, v = map(self.transpose_multihead, qkv) + q = q * self.scale + attn = paddle.matmul(q, k, transpose_y=True) + attn = self.softmax(attn) + attn = self.attn_dropout(attn) + + z = paddle.matmul(attn, v) + z = z.transpose([0, 2, 1, 3]) + if self.skip: # token transformer + z = z.reshape([B, -1, self.in_dim]) + else: # regular attention + z = z.reshape([B, -1, C]) + z = self.proj(z) + z = self.proj_dropout(z) + + # skip connection + if self.skip: + z = z + v.squeeze(1) + + return z + + +class Block(nn.Layer): + """ Transformer block layers + + Transformer block layers contains regular self-attention layers, + mlp layers, norms layers and residual blocks. + + Args: + dim: int, all heads dimension + num_heads: int, num of heads + mlp_ratio: ratio to multiply on mlp input dim as mlp hidden dim, default: 4. 
+ qkv_bias: bool, if True, qkv linear layer is using bias, default: False + qk_scale: float, scale factor to replace dim_head ** -0.5, default: None + dropout: float, dropout rate for projection dropout, default: 0. + attention_dropout: float, dropout rate for attention dropout, default: 0. + droppath: float, drop path rate, default: 0. + """ + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + dropout=0., + attention_dropout=0., + droppath=0.): + super().__init__() + self.norm1 = nn.LayerNorm(dim, epsilon=1e-6) + self.attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + dropout=dropout, + attention_dropout=attention_dropout) + self.drop_path = DropPath(droppath) if droppath > 0. else Identity() + self.norm2 = nn.LayerNorm(dim, epsilon=1e-6) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim * mlp_ratio), + dropout=dropout) + + def forward(self, x): + h = x + x = self.norm1(x) + x = self.attn(x) + x = self.drop_path(x) + x = h + x + + h = x + x = self.norm2(x) + x = self.mlp(x) + x = self.drop_path(x) + x = h + x + return x + + +class TokenPerformer(nn.Layer): + """ Token Performer layers + + Performer layers contains single-attention layers, + mlp layers, norms layers and residual blocks. This module + is used in 'tokens-to-token', which converts image into tokens + and gradually tokenized the tokens. + + Args: + dim: int, all heads dimension + in_dim: int, qkv and out dimension in attention + num_heads: int, num of heads + kernel_ratio: ratio to multiply on prm input dim, default: 0.5. + dropout: float, dropout rate for projection dropout, default: 0. + """ + def __init__(self, dim, in_dim, num_heads=1, kernel_ratio=0.5, dropout=0.1): + super().__init__() + self.embed_dim = in_dim * num_heads + self.kqv = nn.Linear(dim, 3 * self.embed_dim) + self.dropout = nn.Dropout(dropout) + self.proj = nn.Linear(self.embed_dim, self.embed_dim) + self.num_heads = num_heads + self.norm1 = nn.LayerNorm(dim, epsilon=1e-6) + self.norm2 = nn.LayerNorm(self.embed_dim, epsilon=1e-6) + + self.mlp = nn.Sequential(nn.Linear(self.embed_dim, self.embed_dim), + nn.GELU(), + nn.Linear(self.embed_dim, self.embed_dim), + nn.Dropout(dropout)) + + self.m = int(self.embed_dim * kernel_ratio) + + self.w = np.random.random(size=(int(self.embed_dim * kernel_ratio), self.embed_dim)) + # TODO: init with orthognal matrix + #self.w, _ = np.linalg.qr(self.w) + + self.w = paddle.create_parameter( + shape=[int(self.embed_dim * kernel_ratio), self.embed_dim], + dtype='float32', + default_initializer=nn.initializer.Assign(self.w / math.sqrt(self.m))) + + # paddle version 2.1 does not support einsum + def prm_exp(self, x): + # x: [B, T, hs] + # w: [m, hs] + # return x: B, T, m + xd = (x * x).sum(axis=-1, keepdim=True) + xd = xd.expand([xd.shape[0], xd.shape[1], self.m]) / 2 + # same as einsum('bti,mi->btm', x, self.w) + wtx = paddle.matmul(x, self.w, transpose_y=True) + out = paddle.exp(wtx - xd) / math.sqrt(self.m) + return out + + def single_attention(self, x): + kqv = self.kqv(x).chunk(3, axis=-1) + k, q, v = kqv[0], kqv[1], kqv[2] + + qp = self.prm_exp(q) + kp = self.prm_exp(k) + + # same as einsum('bti,bi->bt, qp, kp.sum(axi=1).unsqueeze(2)') + D = paddle.matmul(qp, kp.sum(axis=1).unsqueeze(2)) + # same as einsum('bti,bim->bnm') + kptv = paddle.matmul(v, kp, transpose_x=True) + # same as einsum('bti,bni->btn') + y = paddle.matmul(qp, kptv, transpose_y=True) + y = y / (D.expand([D.shape[0], D.shape[1], self.embed_dim]) + 1e-8) + + # skip 
connection + y = self.proj(y) + y = self.dropout(y) + y = v + y + return y + + def forward(self, x): + x = self.norm1(x) + x = self.single_attention(x) + h = x + x = self.norm2(x) + x = self.mlp(x) + x = h + x + return x + + +class TokenTransformer(nn.Layer): + """ Token Transformer layers + + Transformer layers contains regular self-attention layers, + mlp layers, norms layers and residual blocks. This module + is used in 'tokens-to-token', which converts image into tokens + and gradually tokenized the tokens. + + Args: + dim: int, all heads dimension + in_dim: int, qkv and out dimension in attention + num_heads: int, num of heads + mlp_ratio: ratio to multiply on mlp input dim as mlp hidden dim, default: 1. + qkv_bias: bool, if True, qkv linear layer is using bias, default: False + qk_scale: float, scale factor to replace dim_head ** -0.5, default: None + dropout: float, dropout rate for projection dropout, default: 0. + attention_dropout: float, dropout rate for attention dropout, default: 0. + droppath: float, drop path rate, default: 0. + """ + def __init__(self, + dim, + in_dim, + num_heads, + mlp_ratio=1.0, + qkv_bias=False, + qk_scale=None, + dropout=0., + attention_dropout=0, + droppath=0.): + super().__init__() + self.norm1 = nn.LayerNorm(dim, epsilon=1e-6) + self.attn = Attention(dim, + in_dim=in_dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + dropout=dropout, + attention_dropout=attention_dropout, + skip_connection=True) + self.drop_path = DropPath(droppath) if droppath > 0. else Identity() + self.norm2 = nn.LayerNorm(in_dim, epsilon=1e-6) + self.mlp = Mlp(in_features=in_dim, + hidden_features=int(in_dim * mlp_ratio), + out_features=in_dim, + dropout=dropout) + + def forward(self, x): + x = self.norm1(x) + x = self.attn(x) + h = x + x = self.norm2(x) + x = self.mlp(x) + x = self.drop_path(x) + x = h + x + return x + + +class T2TViT(nn.Layer): + """ T2T-ViT model + Args: + image_size: int, input image size, default: 224 + in_channels: int, input image channels, default: 3 + num_classes: int, num of classes, default: 1000 + token_type: string, type of token embedding ['performer', 'transformer'], default: 'performer' + embed_dim: int, dim of each patch after patch embedding, default: 768 + depth: int, num of self-attention blocks, default: 12 + num_heads: int, num of attention heads, default: 12 + mlp_ratio: float, mlp hidden dim = mlp_ratio * mlp_in_dim, default: 4. + qkv_bias: bool, if True, qkv projection is set with bias, default: True + qk_scale: float, scale factor to replace dim_head ** -0.5, default: None + dropout: float, dropout rate for linear projections, default: 0. + attention_dropout: float, dropout rate for attention, default: 0. + droppath: float, drop path rate, default: 0. 
+ token_dim: int, intermediate dim for patch_embedding module, default: 64 + """ + def __init__(self, + image_size=224, + in_channels=3, + num_classes=1000, + token_type='performer', + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + dropout=0., + attention_dropout=0., + droppath=0, + token_dim=64): + super().__init__() + self.num_classes = num_classes + # convert image to paches: T2T-Module + self.patch_embed = PatchEmbedding(image_size=image_size, + token_type=token_type, + in_channels=in_channels, + embed_dim=embed_dim, + token_dim=token_dim) + num_patches = self.patch_embed.num_patches + # tokens add for classification + self.cls_token = paddle.create_parameter( + shape=[1, 1, embed_dim], + dtype='float32', + default_initializer=nn.initializer.Constant(0.0)) + # positional embeddings for patch positions + self.pos_embed = paddle.create_parameter( + shape=[1, num_patches + 1, embed_dim], + dtype='float32', + default_initializer=nn.initializer.Constant(0.0)) + # dropout for positional embeddings + self.pos_dropout = nn.Dropout(dropout) + # droppath deacay rate + depth_decay = paddle.linspace(0, droppath, depth) + + # craete self-attention layers + layer_list = [] + for i in range(depth): + block_layers = Block(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + dropout=dropout, + attention_dropout=attention_dropout, + droppath=depth_decay[i]) + layer_list.append(copy.deepcopy(block_layers)) + self.blocks = nn.LayerList(layer_list) + self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6) + # classifier head + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else Identity() + + def forward_features(self, x): + # Patch Embedding + x = self.patch_embed(x) + + cls_tokens = self.cls_token.expand([x.shape[0], -1, -1]) + x = paddle.concat([cls_tokens, x], axis=1) + x = x + self.pos_embed + x = self.pos_dropout(x) + + # Self-Attention blocks + for block in self.blocks: + x = block(x) + + x = self.norm(x) + return x[:, 0] # returns only cls_tokens + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +def build_t2t_vit(config): + """build t2t-vit model using config""" + model = T2TViT(image_size=config.DATA.IMAGE_SIZE, + token_type=config.MODEL.TRANS.TOKEN_TYPE, + embed_dim=config.MODEL.TRANS.EMBED_DIM, + depth=config.MODEL.TRANS.DEPTH, + num_heads=config.MODEL.TRANS.NUM_HEADS, + mlp_ratio=config.MODEL.TRANS.MLP_RATIO, + qk_scale=config.MODEL.TRANS.QK_SCALE, + qkv_bias=config.MODEL.TRANS.QKV_BIAS) + return model diff --git a/image_classification/T2T_ViT/tests/__init__.py b/image_classification/T2T_ViT/tests/__init__.py new file mode 100644 index 00000000..a6131c10 --- /dev/null +++ b/image_classification/T2T_ViT/tests/__init__.py @@ -0,0 +1 @@ +# init diff --git a/image_classification/T2T_ViT/tests/test_token_performer.py b/image_classification/T2T_ViT/tests/test_token_performer.py new file mode 100644 index 00000000..31f85dc1 --- /dev/null +++ b/image_classification/T2T_ViT/tests/test_token_performer.py @@ -0,0 +1,98 @@ +import unittest +import numpy as np +import paddle +import paddle.nn as nn +import torch +from t2t_vit import * + + +class TokenPerformerTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + paddle.set_device('cpu') + + @classmethod + def tearDown(cls): + pass + + def test_prm_exp_einsum(self): + x = np.random.randn(2, 3136, 64).astype('float32') + w = np.random.randn(32, 64).astype('float32') + m = 32 + # pytorch + x_pth = 
torch.Tensor(x) + w_pth = torch.Tensor(w) + xd_pth = (x_pth * x_pth).sum(dim=-1, keepdim=True).repeat(1, 1, m) / 2 + #print(xd_pth) + wtx_pth = torch.einsum('bti,mi->btm', x_pth.float(), w_pth) + #print(wtx_pth) + out_pth = torch.exp(wtx_pth - xd_pth) / math.sqrt(m) + #print('-------------------') + # paddle + x_pd = paddle.to_tensor(x) + w_pd = paddle.to_tensor(w) + xd_pd = (x_pd * x_pd).sum(axis=-1, keepdim=True) + xd_pd = xd_pd.expand([xd_pd.shape[0], xd_pd.shape[1], m]) / 2 + #print(xd_pd) + wtx_pd = paddle.matmul(x_pd, w_pd, transpose_y=True) + #print(wtx_pd) + out_pd = paddle.exp(wtx_pd - xd_pd) / math.sqrt(m) + + # check if paddle out equals to pytorch out + out_pth_np = out_pth.cpu().numpy() + out_pd_np = out_pd.cpu().numpy() + self.assertTrue(np.allclose(out_pth_np, out_pd_np, atol=1e-5)) + + def test_single_attention_einsum(self): + qp = np.random.randn(2, 3136, 32).astype('float32') + kp = np.random.randn(2, 3136, 32).astype('float32') + v = np.random.randn(2, 3136, 64).astype('float32') + emb = 64 + # pytorch + qp_pth = torch.Tensor(qp) + kp_pth = torch.Tensor(kp) + v_pth = torch.Tensor(v) + D_pth = torch.einsum('bti,bi->bt', qp_pth, kp_pth.sum(dim=1)).unsqueeze(dim=2) + #print(D_pth.shape) + #print('D_pth: ', D_pth) + kptv_pth = torch.einsum('bin,bim->bnm', v_pth.float(), kp_pth) + #print(kptv_pth) + y_pth = torch.einsum('bti,bni->btn', qp_pth, kptv_pth) + y_pth = y_pth / (D_pth.repeat(1, 1, emb) + 1e-3) + #print('y_pth = ', y_pth) + + #print('-------------------') + # paddle + qp_pd = paddle.to_tensor(qp) + kp_pd = paddle.to_tensor(kp) + v_pd = paddle.to_tensor(v) + D_pd = paddle.matmul(qp_pd, kp_pd.sum(axis=1).unsqueeze(2)) + #print(D_pd.shape) + #print('D_pd: ', D_pd) + kptv_pd = paddle.matmul(v_pd, kp_pd, transpose_x=True) + #print(kptv_pd) + y_pd = paddle.matmul(qp_pd, kptv_pd, transpose_y=True) + y_pd = y_pd / (D_pd.expand([D_pd.shape[0], D_pd.shape[1], emb]) + 1e-3) + #print('y_pd: ', y_pd) + + # check if paddle out equals to pytorch out + D_pth_np = D_pth.cpu().numpy() + D_pd_np = D_pd.cpu().numpy() + self.assertTrue(np.allclose(D_pth_np, D_pd_np, rtol=1e-2)) + #print('D same') + + kptv_pth_np = kptv_pth.cpu().numpy() + kptv_pd_np = kptv_pd.cpu().numpy() + self.assertTrue(np.allclose(kptv_pth_np, kptv_pd_np, rtol=1e-2)) + #print('kptv same') + + y_pth_np = y_pth.cpu().numpy() + y_pd_np = y_pd.cpu().numpy() + self.assertTrue(np.allclose(y_pth_np, y_pd_np, rtol=1e-2)) + #print('y same') + + #@unittest.skip('skip for debug') + def test_token_performer(self): + tp = TokenPerformer(96, 96) + + diff --git a/image_classification/T2T_ViT/utils.py b/image_classification/T2T_ViT/utils.py new file mode 100644 index 00000000..44800527 --- /dev/null +++ b/image_classification/T2T_ViT/utils.py @@ -0,0 +1,120 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
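+# Sketch of the learning-rate curve produced by WarmupCosineScheduler below
+# (epoch counted from 0, with the default cycles=0.5):
+#   warmup (epoch < warmup_epochs):
+#       lr = warmup_start_lr + (start_lr - warmup_start_lr) * epoch / warmup_epochs
+#   cosine decay (otherwise):
+#       progress = (epoch - warmup_epochs) / max(1, total_epochs - warmup_epochs)
+#       lr = end_lr + (start_lr - end_lr) * 0.5 * (1 + cos(pi * progress))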
+ +"""utils for ViT + +Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training +and WarmupCosineScheduler for training + +""" + +import math +from paddle.optimizer.lr import LRScheduler + + +class AverageMeter(): + """ Meter for monitoring losses""" + def __init__(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.reset() + + def reset(self): + """reset all values to zeros""" + self.avg = 0 + self.sum = 0 + self.cnt = 0 + + def update(self, val, n=1): + """update avg by val and n, where val is the avg of n values""" + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + + +def get_exclude_from_weight_decay_fn(exclude_list=[]): + """ Set params with no weight decay during the training + + For certain params, e.g., positional encoding in ViT, weight decay + may not needed during the learning, this method is used to find + these params. + + Args: + exclude_list: a list of params names which need to exclude + from weight decay. + Returns: + exclude_from_weight_decay_fn: a function returns True if param + will be excluded from weight decay + """ + if len(exclude_list) == 0: + exclude_from_weight_decay_fn = None + else: + def exclude_fn(param): + for name in exclude_list: + if param.endswith(name): + return False + return True + exclude_from_weight_decay_fn = exclude_fn + return exclude_from_weight_decay_fn + + +class WarmupCosineScheduler(LRScheduler): + """Warmup Cosine Scheduler + + First apply linear warmup, then apply cosine decay schedule. + Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" + Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining + "total_epochs - warmup_epochs" + + Attributes: + learning_rate: the starting learning rate (without warmup), not used here! + warmup_start_lr: warmup starting learning rate + start_lr: the starting learning rate (without warmup) + end_lr: the ending learning rate after whole loop + warmup_epochs: # of epochs for warmup + total_epochs: # of total epochs (include warmup) + """ + def __init__(self, + learning_rate, + warmup_start_lr, + start_lr, + end_lr, + warmup_epochs, + total_epochs, + cycles=0.5, + last_epoch=-1, + verbose=False): + """init WarmupCosineScheduler """ + self.warmup_epochs = warmup_epochs + self.total_epochs = total_epochs + self.warmup_start_lr = warmup_start_lr + self.start_lr = start_lr + self.end_lr = end_lr + self.cycles = cycles + super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + """ return lr value """ + if self.last_epoch < self.warmup_epochs: + val = (self.start_lr - self.warmup_start_lr) * float( + self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr + return val + + progress = float(self.last_epoch - self.warmup_epochs) / float( + max(1, self.total_epochs - self.warmup_epochs)) + val = max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) + return val diff --git a/image_classification/VOLO/README.md b/image_classification/VOLO/README.md new file mode 100644 index 00000000..21fa2b12 --- /dev/null +++ b/image_classification/VOLO/README.md @@ -0,0 +1,164 @@ +# VOLO: Vision Outlooker for Visual Recognition, [arxiv](https://arxiv.org/abs/2103.17239) + +PaddlePaddle training/validation code and pretrained models for **VOLO**. + +The official pytorch implementation is [here](https://github.com/sail-sg/volo). 
+ +This implementation is developed by [PaddleViT](https://github.com/BR-IDL/PaddleViT.git). + + +

+drawing +

VOLO Model Overview

+

+ +### Update +Update (2021-08-11): Code is released and ported weights are uploaded. + +## Models Zoo +| Model | Acc@1 | Acc@5 | Image Size | Crop_pct | Interpolation | Link | +|--------------------------------|-------|-------|------------|----------|---------------|--------------| +| volo_d5_224_86.10 | 86.08 | 97.58 | 224 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1GBOBPCBJYZfWybK-Xp0Otn0N4NXpct0G/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1t9gPLRAOkdXaG55fVADQZg)(td49) | +| volo_d5_512_87.07 | 87.05 | 97.97 | 512 | 1.15 | bicubic | [google](https://drive.google.com/file/d/1Phf_wHsjRZ1QrZ8oFrqsYuhDr4TXrVkc/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1X-WjpNqvWva2M977jgHosg)(irik) | + +> *The results are evaluated on ImageNet2012 validation set. + +## Notebooks +We provide a few notebooks in aistudio to help you get started: + +**\*(coming soon)\*** + + +## Requirements +- Python>=3.6 +- yaml>=0.2.5 +- [PaddlePaddle](https://www.paddlepaddle.org.cn/documentation/docs/en/install/index_en.html)>=2.1.0 +- [yacs](https://github.com/rbgirshick/yacs)>=0.1.8 + +## Data +ImageNet2012 dataset is used in the following folder structure: +``` +│imagenet/ +├──train/ +│ ├── n01440764 +│ │ ├── n01440764_10026.JPEG +│ │ ├── n01440764_10027.JPEG +│ │ ├── ...... +│ ├── ...... +├──val/ +│ ├── n01440764 +│ │ ├── ILSVRC2012_val_00000293.JPEG +│ │ ├── ILSVRC2012_val_00002138.JPEG +│ │ ├── ...... +│ ├── ...... +``` + +## Usage +To use the model with pretrained weights, download the `.pdparam` weight file and change related file paths in the following python scripts. The model config files are located in `./configs/`. + +For example, assume the downloaded weight file is stored in `./volo_d5_224.pdparams`, to use the `volo_d5_224` model in python: +```python +from config import get_config +from volo import build_volo as build_model +# config files in ./configs/ +config = get_config('./configs/volo_d5_224.yaml') +# build model +model = build_model(config) +# load pretrained weights, .pdparams is NOT needed +model_state_dict = paddle.load('./volo_d5_224') +model.set_dict(model_state_dict) +``` + +## Evaluation +To evaluate VOLO model performance on ImageNet2012 with a single GPU, run the following script using command line: +```shell +sh run_eval.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/volo_d5_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./volo_d5_224' +``` + +
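+The `Crop_pct` and `Interpolation` columns in the model zoo table above describe
+the evaluation-time preprocessing of each checkpoint. The snippet below is a
+minimal sketch of the common (timm-style) convention; the transform pipeline,
+the ImageNet statistics and the handling of `crop_pct > 1.0` (the 512 model)
+are assumptions here, so check the repository's dataset/transform code for the
+exact settings:
+
+```python
+from paddle.vision import transforms
+
+def build_eval_transforms(image_size=224, crop_pct=1.0, interpolation='bicubic'):
+    # resize the shorter side to roughly image_size / crop_pct,
+    # then center-crop to image_size
+    resize_size = int(image_size / crop_pct)
+    return transforms.Compose([
+        transforms.Resize(resize_size, interpolation=interpolation),
+        transforms.CenterCrop(image_size),
+        transforms.ToTensor(),
+        # commonly used ImageNet mean/std (assumed, not taken from this repo)
+        transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                             std=[0.229, 0.224, 0.225]),
+    ])
+```
+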
+ + +Run evaluation using multi-GPUs: + + + +```shell +sh run_eval_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/volo_d5_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./volo_d5_224' +``` + +
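+
+Before launching a full evaluation, it can be handy to sanity-check the downloaded weights on a single image. The following is a minimal sketch, not part of the provided scripts: the image path `test.jpg` is a placeholder, the preprocessing mirrors `get_val_transforms()` in `datasets.py` for the 224x224 config, and `paddle.load` is given the full `.pdparams` filename, the same convention used by `main_single_gpu.py` when loading pretrained weights.
+
+```python
+import paddle
+from PIL import Image
+from paddle.vision import transforms
+from config import get_config
+from volo import build_volo as build_model
+
+config = get_config('./configs/volo_d5_224.yaml')
+model = build_model(config)
+model.set_dict(paddle.load('./volo_d5_224.pdparams'))  # downloaded weight file
+model.eval()
+
+# same resize / center-crop / normalize as get_val_transforms() with CROP_PCT=1.0
+trans = transforms.Compose([
+    transforms.Resize(224, 'bicubic'),
+    transforms.CenterCrop((224, 224)),
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+])
+img = trans(Image.open('test.jpg').convert('RGB')).unsqueeze(0)  # placeholder image path
+
+with paddle.no_grad():
+    logits = model(img)
+print(paddle.argmax(logits, axis=-1))  # predicted ImageNet-1k class id
+```
+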
+ +## Training +To train the VOLO model on ImageNet2012 with single GPU, run the following script using command line: +```shell +sh run_train.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/volo_d5_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=32 \ + -data_path='/dataset/imagenet' \ +``` + + +
+
+<summary>
+Run training using multi-GPUs:
+</summary>
+
+
+```shell
+sh run_train_multi.sh
+```
+or
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+python main_multi_gpu.py \
+    -cfg='./configs/volo_d5_224.yaml' \
+    -dataset='imagenet2012' \
+    -batch_size=16 \
+    -data_path='/dataset/imagenet' \
+```
+
+</details>
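+
+Two training details are easy to miss. First, gradients are accumulated over `TRAIN.ACCUM_ITER` batches (2 by default in `config.py`), so each optimizer step effectively sees `batch_size × num_gpus × accum_iter` images; for the multi-GPU command above that is 16 × 4 × 2 = 128 images per update. Second, the learning rate follows the `WarmupCosineScheduler` defined in `utils.py`, stepped once per epoch by the training scripts. The sketch below only illustrates the schedule shape using the default values from `config.py`; an actual run uses whatever the merged YAML/CLI config specifies.
+
+```python
+from utils import WarmupCosineScheduler
+
+# defaults from config.py: BASE_LR=0.001, WARMUP_START_LR=1e-6,
+# END_LR=5e-4, WARMUP_EPOCHS=3, NUM_EPOCHS=300
+scheduler = WarmupCosineScheduler(learning_rate=0.001,
+                                  warmup_start_lr=1e-6,
+                                  start_lr=0.001,
+                                  end_lr=5e-4,
+                                  warmup_epochs=3,
+                                  total_epochs=300)
+
+for epoch in range(10):
+    print(f"epoch {epoch:03d}: lr = {scheduler.get_lr():.6f}")
+    scheduler.step()  # called once per epoch, as in main_single_gpu.py / main_multi_gpu.py
+```
+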
+ + + +## Visualization Attention Map +**(coming soon)** + +## Reference +``` +@article{yuan2021volo, + title={Volo: Vision outlooker for visual recognition}, + author={Yuan, Li and Hou, Qibin and Jiang, Zihang and Feng, Jiashi and Yan, Shuicheng}, + journal={arXiv preprint arXiv:2106.13112}, + year={2021} +} +``` diff --git a/image_classification/VOLO/config.py b/image_classification/VOLO/config.py new file mode 100644 index 00000000..b40287e1 --- /dev/null +++ b/image_classification/VOLO/config.py @@ -0,0 +1,152 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration + +Configuration for data, model archtecture, and training, etc. +Config can be set by .yaml file or by argparser(limited usage) + +""" + +import os +from yacs.config import CfgNode as CN +import yaml + +_C = CN() +_C.BASE = [''] + +# data settings +_C.DATA = CN() +_C.DATA.BATCH_SIZE = 256 #256 # train batch_size for single GPU +_C.DATA.BATCH_SIZE_EVAL = 8 #64 # val batch_size for single GPU +_C.DATA.DATA_PATH = '/dataset/imagenet/' # path to dataset +_C.DATA.DATASET = 'imagenet2012' # dataset name +_C.DATA.IMAGE_SIZE = 224 # input image size: 224 for pretrain, 384 for finetune +_C.DATA.CROP_PCT = 0.875 # input image scale ratio, scale is applied before centercrop in eval mode +_C.DATA.NUM_WORKERS = 2 # number of data loading threads + +# model settings +_C.MODEL = CN() +_C.MODEL.TYPE = 'VOLO' +_C.MODEL.NAME = 'VOLO' +_C.MODEL.RESUME = None +_C.MODEL.PRETRAINED = None +_C.MODEL.NUM_CLASSES = 1000 +_C.MODEL.DROPOUT = 0.1 +_C.MODEL.DROPPATH = 0.1 +_C.MODEL.ATTENTION_DROPOUT = 0.0 +_C.MODEL.STEM_HIDDEN_DIM = 128 + +# transformer settings +_C.MODEL.TRANS = CN() +_C.MODEL.TRANS.PATCH_SIZE = 32 +_C.MODEL.TRANS.LAYERS = [12, 12, 20, 4] +_C.MODEL.TRANS.EMBED_DIMS = [384, 768, 768, 768] +_C.MODEL.TRANS.MLP_RATIOS = [4, 4, 4, 4] +_C.MODEL.TRANS.DOWNSAMPLES = [True, False, False, False] +_C.MODEL.TRANS.OUTLOOK_ATTENTION = [True, False, False, False] +_C.MODEL.TRANS.NUM_HEADS = [12, 16, 16, 16] +_C.MODEL.TRANS.QKV_BIAS = False +_C.MODEL.TRANS.QK_SCALE = False + + +# training settings +_C.TRAIN = CN() +_C.TRAIN.LAST_EPOCH = 0 +_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 3 #34 # ~ 10k steps for 4096 batch size +_C.TRAIN.WEIGHT_DECAY = 0.05 #0.3 # 0.0 for finetune +_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune +_C.TRAIN.WARMUP_START_LR = 1e-6 #0.0 +_C.TRAIN.END_LR = 5e-4 +_C.TRAIN.GRAD_CLIP = 1.0 +_C.TRAIN.ACCUM_ITER = 2 #1 + +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' +_C.TRAIN.LR_SCHEDULER.MILESTONES = "30, 60, 90" # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 # only used in StepLRScheduler + +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'AdamW' +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for adamW +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# misc +_C.SAVE = "./output" +_C.TAG = 
"default" +_C.SAVE_FREQ = 10 # freq to save chpt +_C.REPORT_FREQ = 50 # freq to logging info +_C.VALIDATE_FREQ = 100 # freq to do validation +_C.SEED = 0 +_C.EVAL = False # run evaluation only +_C.LOCAL_RANK = 0 +_C.NGPUS = -1 + + +def _update_config_from_file(config, cfg_file): + config.defrost() + with open(cfg_file, 'r') as infile: + yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader) + for cfg in yaml_cfg.setdefault('BASE', ['']): + if cfg: + _update_config_from_file( + config, os.path.join(os.path.dirname(cfg_file), cfg) + ) + print('merging config from {}'.format(cfg_file)) + config.merge_from_file(cfg_file) + config.freeze() + +def update_config(config, args): + """Update config by ArgumentParser + Args: + args: ArgumentParser contains options + Return: + config: updated config + """ + if args.cfg: + _update_config_from_file(config, args.cfg) + config.defrost() + if args.dataset: + config.DATA.DATASET = args.dataset + if args.batch_size: + config.DATA.BATCH_SIZE = args.batch_size + if args.image_size: + config.DATA.IMAGE_SIZE = args.image_size + if args.data_path: + config.DATA.DATA_PATH = args.data_path + if args.ngpus: + config.NGPUS = args.ngpus + if args.eval: + config.EVAL = True + config.DATA.BATCH_SIZE_EVAL = args.batch_size + if args.pretrained: + config.MODEL.PRETRAINED = args.pretrained + if args.resume: + config.MODEL.RESUME = args.resume + if args.last_epoch: + config.TRAIN.LAST_EPOCH = args.last_epoch + + #config.freeze() + return config + + +def get_config(cfg_file=None): + """Return a clone of config or load from yaml file""" + config = _C.clone() + if cfg_file: + _update_config_from_file(config, cfg_file) + return config diff --git a/image_classification/VOLO/configs/volo_d5_224.yaml b/image_classification/VOLO/configs/volo_d5_224.yaml new file mode 100644 index 00000000..02c5c138 --- /dev/null +++ b/image_classification/VOLO/configs/volo_d5_224.yaml @@ -0,0 +1,14 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 1.0 +MODEL: + TYPE: volo + NAME: volo_d5_224 + TRANS: + LAYERS: [12, 12, 20, 4] + EMBED_DIMS: [384, 768, 768, 768] + NUM_HEADS: [12, 16, 16, 16] + MLP_RATIOS: [4, 4, 4, 4] + DOWNSAMPLES: [True, False, False, False] + OUTLOOK_ATTENTION: [True, False, False, False] + STEM_HIDDEN_DIM: 128 diff --git a/image_classification/VOLO/configs/volo_d5_512.yaml b/image_classification/VOLO/configs/volo_d5_512.yaml new file mode 100644 index 00000000..85ef3a98 --- /dev/null +++ b/image_classification/VOLO/configs/volo_d5_512.yaml @@ -0,0 +1,14 @@ +DATA: + IMAGE_SIZE: 512 + CROP_PCT: 1.15 +MODEL: + TYPE: volo + NAME: volo_d5_224 + TRANS: + LAYERS: [12, 12, 20, 4] + EMBED_DIMS: [384, 768, 768, 768] + NUM_HEADS: [12, 16, 16, 16] + MLP_RATIOS: [4, 4, 4, 4] + DOWNSAMPLES: [True, False, False, False] + OUTLOOK_ATTENTION: [True, False, False, False] + STEM_HIDDEN_DIM: 128 diff --git a/image_classification/VOLO/datasets.py b/image_classification/VOLO/datasets.py new file mode 100644 index 00000000..eeb16f89 --- /dev/null +++ b/image_classification/VOLO/datasets.py @@ -0,0 +1,190 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Dataset related classes and methods for ViT training and validation +Cifar10, Cifar100 and ImageNet2012 are supported +""" + +import os +import math +from PIL import Image +from paddle.io import Dataset, DataLoader, DistributedBatchSampler +from paddle.vision import transforms, datasets, image_load + +class ImageNet2012Dataset(Dataset): + """Build ImageNet2012 dataset + + This class gets train/val imagenet datasets, which loads transfomed data and labels. + + Attributes: + file_folder: path where imagenet images are stored + transform: preprocessing ops to apply on image + img_path_list: list of full path of images in whole dataset + label_list: list of labels of whole dataset + """ + + def __init__(self, file_folder, mode="train", transform=None): + """Init ImageNet2012 Dataset with dataset file path, mode(train/val), and transform""" + super(ImageNet2012Dataset, self).__init__() + assert mode in ["train", "val"] + self.file_folder = file_folder + self.transform = transform + self.img_path_list = [] + self.label_list = [] + + if mode == "train": + self.list_file = os.path.join(self.file_folder, "train_list.txt") + else: + self.list_file = os.path.join(self.file_folder, "val_list.txt") + + with open(self.list_file, 'r') as infile: + for line in infile: + img_path = line.strip().split()[0] + img_label = int(line.strip().split()[1]) + self.img_path_list.append(os.path.join(self.file_folder, img_path)) + self.label_list.append(img_label) + print(f'----- Imagenet2012 image {mode} list len = {len(self.label_list)}') + + def __len__(self): + return len(self.label_list) + + def __getitem__(self, index): + data = Image.open(self.img_path_list[index]).convert('RGB') + data = self.transform(data) + label = self.label_list[index] + + return data, label + + +def get_train_transforms(config): + """ Get training transforms + + For training, a RandomResizedCrop is applied, then normalization is applied with + [0.5, 0.5, 0.5] mean and std. The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + transforms_train = transforms.Compose([ + transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), + scale=(0.05, 1.0)), + transforms.ToTensor(), + #transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_train + + +def get_val_transforms(config): + """ Get training transforms + + For validation, image is first Resize then CenterCrop to image_size. + Then normalization is applied with [0.5, 0.5, 0.5] mean and std. + The input pixel values must be rescaled to [0, 1.] 
+ Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) + transforms_val = transforms.Compose([ + # scale_size must be single int, which will resize the shorter side of image + transforms.Resize(scale_size, 'bicubic'), + transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), + transforms.ToTensor(), + #transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_val + + +def get_dataset(config, mode='train'): + """ Get dataset from config and mode (train/val) + + Returns the related dataset object according to configs and mode(train/val) + + Args: + config: configs contains dataset related settings. see config.py for details + Returns: + dataset: dataset object + """ + + assert mode in ['train', 'val'] + if config.DATA.DATASET == "cifar10": + if mode == 'train': + dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "cifar100": + if mode == 'train': + dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "imagenet2012": + if mode == 'train': + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_train_transforms(config)) + else: + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_val_transforms(config)) + else: + raise NotImplementedError( + "[{config.DATA.DATASET}] Only cifar10, cifar100, imagenet2012 are supported now") + return dataset + + +def get_dataloader(config, dataset, mode='train', multi_process=False): + """Get dataloader with config, dataset, mode as input, allows multiGPU settings. + + Multi-GPU loader is implements as distributedBatchSampler. + + Args: + config: see config.py for details + dataset: paddle.io.dataset object + mode: train/val + multi_process: if True, use DistributedBatchSampler to support multi-processing + Returns: + dataloader: paddle.io.DataLoader object. + """ + + if mode == 'train': + batch_size = config.DATA.BATCH_SIZE + else: + batch_size = config.DATA.BATCH_SIZE_EVAL + + if multi_process is True: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=(mode == 'train')) + dataloader = DataLoader(dataset, + batch_sampler=sampler, + num_workers=config.DATA.NUM_WORKERS) + else: + dataloader = DataLoader(dataset, + batch_size=batch_size, + num_workers=config.DATA.NUM_WORKERS, + shuffle=(mode == 'train')) + return dataloader diff --git a/image_classification/VOLO/droppath.py b/image_classification/VOLO/droppath.py new file mode 100644 index 00000000..25b8d5ff --- /dev/null +++ b/image_classification/VOLO/droppath.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth +""" + +import paddle +import paddle.nn as nn + + +class DropPath(nn.Layer): + """DropPath class""" + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def drop_path(self, inputs): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if self.drop_prob == 0. or not self.training: + return inputs + keep_prob = 1 - self.drop_prob + keep_prob = paddle.to_tensor(keep_prob, dtype='float32') + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide(keep_prob) * random_tensor #divide is to keep same output expectation + return output + + def forward(self, inputs): + return self.drop_path(inputs) + + +#def main(): +# tmp = paddle.to_tensor(np.random.rand(8, 16, 8, 8), dtype='float32') +# dp = DropPath(0.5) +# out = dp(tmp) +# print(out) +# +#if __name__ == "__main__": +# main() diff --git a/image_classification/VOLO/fold.py b/image_classification/VOLO/fold.py new file mode 100644 index 00000000..f3c8b9e1 --- /dev/null +++ b/image_classification/VOLO/fold.py @@ -0,0 +1,106 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Fold operation, which is usually equivalent to 'col2im' operation. +Current paddle version (2.1) is not supported native Fold operation. +This hack is based on for-loop, which may be optimized in the future. 
+""" + +import numpy as np +import paddle + + +def fold(inputs, output_size, kernel_size, padding, stride): + """ + Args: + x: Tensor, input tensor, only support 3D tensor, [Batch, C * kernel_size * kernel_size, L] + output_size, Tuple/List, contains the height and width of the output tensor, len = 2 + kernel_size: int, kernel size + padding: int, num of pad around the input + stride: int, stride for sliding window + """ + + B, D, L = inputs.shape + H, W = output_size + C = int(D / (kernel_size * kernel_size)) + out_h = (H + 2*padding -kernel_size) // stride + 1 + out_w = (W + 2*padding -kernel_size) // stride + 1 + + inputs = inputs.reshape([B, C, kernel_size, kernel_size, out_h, out_w]) + + img = paddle.zeros([B, C, H + 2 * padding + stride - 1, W + 2 * padding + stride -1], dtype=inputs.dtype) + + for y in range(kernel_size): + y_max = y + stride * out_h + for x in range(kernel_size): + x_max = x + stride * out_w + img[:, :, y:y_max:stride, x:x_max:stride] += inputs[:, :, y, x, :, :] + + return img[:, :, padding: H + padding, padding: W + padding] + + + +#def main(): +# paddle.set_device('cpu') +# arr = [ +# [1, 1, 1, 1, 2, 2, 2, 2], +# [1, 1, 1, 1, 2, 2, 2, 2], +# [1, 1, 1, 1, 2, 2, 2, 2], +# [1, 1, 1, 1, 2, 2, 2, 2], +# [3, 3, 3, 3, 4, 4, 4, 4], +# [3, 3, 3, 3, 4, 4, 4, 4], +# [3, 3, 3, 3, 4, 4, 4, 4], +# [3, 3, 3, 3, 4, 4, 4, 4], +# ] +# arr = np.array(arr) +# tmp = paddle.to_tensor(arr, dtype='float32') +# tmp = tmp.reshape([1, 1, 8, 8]) +# +# unfold = paddle.nn.Unfold(3, 1, 1) +# out = unfold(tmp) +# +# for i in range(out.shape[-1]): +# row = out[:, :, i].astype('int8').numpy() +# print(row) +# out = fold(out, output_size=(8, 8), kernel_size=3, padding=1, stride=1) +# print(out) +# +#if __name__ == "__main__": +# main() +# +# +## k=3, p=2, s=2 +##[[[4. , 2. , 4. , 2. , 8. , 4. , 8. , 4. ], +## 2. , 1. , 2. , 1. , 4. , 2. , 4. , 2. ], +## 4. , 2. , 4. , 2. , 8. , 4. , 8. , 4. ], +## 2. , 1. , 2. , 1. , 4. , 2. , 4. , 2. ], +## 12., 6. , 12., 6. , 16., 8. , 16., 8. ], +## 6. , 3. , 6. , 3. , 8. , 4. , 8. , 4. ], +## 12., 6. , 12., 6. , 16., 8. , 16., 8. ], +## 6. , 3. , 6. , 3. , 8. , 4. , 8. , 4. ]]]]) +# +# +## k = 3, p=1, s=1 +## [[[[4. , 6. , 6. , 6. , 12., 12., 12., 8. ], +## [6. , 9. , 9. , 9. , 18., 18., 18., 12.], +## [6. , 9. , 9. , 9. , 18., 18., 18., 12.], +## [6. , 9. , 9. , 9. , 18., 18., 18., 12.], +## [18., 27., 27., 27., 36., 36., 36., 24.], +## [18., 27., 27., 27., 36., 36., 36., 24.], +## [18., 27., 27., 27., 36., 36., 36., 24.], +## [12., 18., 18., 18., 24., 24., 24., 16.]]]]) +## +# diff --git a/image_classification/VOLO/main_multi_gpu.py b/image_classification/VOLO/main_multi_gpu.py new file mode 100644 index 00000000..2c0bb7c4 --- /dev/null +++ b/image_classification/VOLO/main_multi_gpu.py @@ -0,0 +1,362 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""VOLO training/validation using multiple GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as dist +from datasets import get_dataloader, get_dataset +from volo import build_volo as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('VOLO') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +arguments = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, arguments) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + # + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + + 
train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + dist.all_reduce(loss) + dist.all_reduce(acc1) + dist.all_reduce(acc5) + loss = loss / dist.get_world_size() + acc1 = acc1 / dist.get_world_size() + acc5 = acc5 / dist.get_world_size() + + batch_size = paddle.to_tensor(image.shape[0]) + dist.all_reduce(batch_size) + + val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) + val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) + val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0]) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main_worker(*args): + # STEP 0. Preparation + dist.init_parallel_env() + last_epoch = config.TRAIN.LAST_EPOCH + world_size = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + seed = config.SEED + local_rank + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # STEP 1. Create model + model = build_model(config) + model = paddle.DataParallel(model) + # STEP 2. Create train and val dataloader + dataset_train, dataset_val = args[0], args[1] + dataloader_train = get_dataloader(config, dataset_train, 'train', True) + dataloader_val = get_dataloader(config, dataset_val, 'test', True) + total_batch_train = len(dataloader_train) + total_batch_val = len(dataloader_val) + logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + # STEP 3. Define criterion + criterion = nn.CrossEntropyLoss() + # STEP 4. 
Define optimizer and lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip, + #apply_decay_param_fun=get_exclude_from_weight_decay_fn(['pos_embed', 'cls_token']), + ) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # STEP 5. Load pretrained model / load resumt model and optimizer states + if config.MODEL.PRETRAINED: + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + + # STEP 6. 
Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + + # STEP 7. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER) + scheduler.step() + + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if local_rank == 0: + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +def main(): + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS + dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + + +if __name__ == "__main__": + main() diff --git a/image_classification/VOLO/main_single_gpu.py b/image_classification/VOLO/main_single_gpu.py new file mode 100644 index 00000000..3bc0bb92 --- /dev/null +++ b/image_classification/VOLO/main_single_gpu.py @@ -0,0 +1,340 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""VOLO training/validation using single GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from datasets import get_dataloader +from datasets import get_dataset +from volo import build_volo as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('VOLO') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +args = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, args) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +#config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + + train_time = 
time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + batch_size = image.shape[0] + val_loss_meter.update(loss.numpy()[0], batch_size) + val_acc1_meter.update(acc1.numpy()[0], batch_size) + val_acc5_meter.update(acc5.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.val:.4f} ({val_acc1_meter.avg:.4f}), " + + f"Avg Acc@1: {val_acc5_meter.val:.4f} ({val_acc5_meter.avg:.4f})") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main(): + # STEP 0. Preparation + last_epoch = config.TRAIN.LAST_EPOCH + seed = config.SEED + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + #paddle.set_device('gpu:0') + + # STEP 1. Create model + model = build_model(config) + + # STEP 2. Create train and val dataloader + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + dataloader_train = get_dataloader(config, dataset_train, 'train', False) + dataloader_val = get_dataloader(config, dataset_val, 'val', False) + + # STEP 3. Define criterion + criterion = nn.CrossEntropyLoss() + + # STEP 4. Define lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + # STEP 5. 
Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # STEP 6. Load pretrained model or load resume model and optimizer states + if config.MODEL.PRETRAINED: + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams'), "Wrong PRETRAINED model name, note that file ext '.pdparams' is NOT needed!" + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") + + # STEP 7. Start validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + + # STEP 8. Start training and validation + logging.info(f"----- Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. 
LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=len(dataloader_train), + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + ) + scheduler.step() + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +if __name__ == "__main__": + main() diff --git a/image_classification/VOLO/port_weights/load_pytorch_weights_224.py b/image_classification/VOLO/port_weights/load_pytorch_weights_224.py new file mode 100644 index 00000000..0f654e8c --- /dev/null +++ b/image_classification/VOLO/port_weights/load_pytorch_weights_224.py @@ -0,0 +1,235 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +from config import * +from volo import * +from pytorch.volo.models.volo import volo_d5 +from pytorch.volo.utils import load_pretrained_weights + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/volo_d5_224.yaml') +parser.add_argument('-dataset', type=str, default="imagenet2012") +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + print('----------------------------------') + +def print_model_named_buffers(model): + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + print('----------------------------------') + +def torch_to_paddle_mapping(): + mapping = [ + ('cls_token', 'cls_token'), + ('pos_embed', 'pos_embed'), + ('patch_embed.proj', 'patch_embed.proj'), + ] + + # patch embedding: + th_prefix = 'patch_embed.conv' + pp_prefix = 'patch_embed.stem' + layer_mapping = [ + (f'{th_prefix}.0.weight', f'{pp_prefix}.0.weight'),#conv + (f'{th_prefix}.1.weight', f'{pp_prefix}.1.weight'),#bn + (f'{th_prefix}.1.bias', f'{pp_prefix}.1.bias'),#bn + (f'{th_prefix}.1.running_mean', f'{pp_prefix}.1._mean'),#bn + (f'{th_prefix}.1.running_var', f'{pp_prefix}.1._variance'),#bn + (f'{th_prefix}.3.weight', f'{pp_prefix}.3.weight'),#conv + (f'{th_prefix}.4.weight', f'{pp_prefix}.4.weight'),#bn + (f'{th_prefix}.4.bias', f'{pp_prefix}.4.bias'),#bn + (f'{th_prefix}.4.running_mean', f'{pp_prefix}.4._mean'),#bn + (f'{th_prefix}.4.running_var', f'{pp_prefix}.4._variance'),#bn + (f'{th_prefix}.6.weight', f'{pp_prefix}.6.weight'),#conv + (f'{th_prefix}.7.weight', f'{pp_prefix}.7.weight'),#bn + (f'{th_prefix}.7.bias', f'{pp_prefix}.7.bias'),#bn + (f'{th_prefix}.7.running_mean', f'{pp_prefix}.7._mean'),#bn + (f'{th_prefix}.7.running_var', f'{pp_prefix}.7._variance'),#bn + ] + mapping.extend(layer_mapping) + + # models + for idx, stage_idx in enumerate([0, 2, 3, 4]): + for layer_idx in range(config.MODEL.TRANS.LAYERS[idx]): + pp_prefix = f'model.{stage_idx}.{layer_idx}' + th_prefix = f'network.{stage_idx}.{layer_idx}' + + if config.MODEL.TRANS.OUTLOOK_ATTENTION[idx]: + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.attn.v.weight', f'{pp_prefix}.attn.v.weight'), + (f'{th_prefix}.attn.attn', f'{pp_prefix}.attn.attn'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + else: + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.attn.qkv.weight', f'{pp_prefix}.attn.qkv.weight'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + 
(f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + layer_mapping = [ + ('network.1.proj', 'model.1.proj'), + ] + mapping.extend(layer_mapping) + # Post layers + pp_prefix = f'post_model' + th_prefix = f'post_network' + for idx in range(2): + layer_mapping = [ + (f'{th_prefix}.{idx}.norm1', f'{pp_prefix}.{idx}.norm1'), + (f'{th_prefix}.{idx}.attn.kv.weight', f'{pp_prefix}.{idx}.attn.kv.weight'), + (f'{th_prefix}.{idx}.attn.q.weight', f'{pp_prefix}.{idx}.attn.q.weight'), + (f'{th_prefix}.{idx}.attn.proj', f'{pp_prefix}.{idx}.attn.proj'), + (f'{th_prefix}.{idx}.norm2', f'{pp_prefix}.{idx}.norm2'), + (f'{th_prefix}.{idx}.mlp.fc1', f'{pp_prefix}.{idx}.mlp.fc1'), + (f'{th_prefix}.{idx}.mlp.fc2', f'{pp_prefix}.{idx}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + # Head layers + head_mapping = [ + ('aux_head', 'aux_head'), + ('norm', 'norm'), + ('head', 'head') + ] + mapping.extend(head_mapping) + + return mapping + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_volo(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + device = torch.device('cpu') + torch_model = volo_d5(img_size=config.DATA.IMAGE_SIZE) + load_pretrained_weights(torch_model, './pytorch/volo/d5_224_86.10.pth.tar', + use_ema=False, strict=False, num_classes=1000) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + print('========================================================') + print('========================================================') + print('========================================================') + print('========================================================') + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[1, 0:100]) + print(out_paddle[1, 0:100]) + assert np.allclose(out_torch[0], out_paddle[0], atol = 1e-3) + print('===== out 0 equal OK') + assert np.allclose(out_torch[1], out_paddle[1], atol = 1e-3) + print('===== out 1 equal OK') + + # save weights for paddle model + print('===== saving .pdparams') + model_path = os.path.join('./d5_512_87.07.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + print('all done') + + +if __name__ == "__main__": + main() diff --git a/image_classification/VOLO/port_weights/load_pytorch_weights_512.py b/image_classification/VOLO/port_weights/load_pytorch_weights_512.py new file mode 100644 index 00000000..9a5fd795 --- /dev/null +++ b/image_classification/VOLO/port_weights/load_pytorch_weights_512.py @@ -0,0 +1,244 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import torch +from config import * +from volo import * +from pytorch.volo.models.volo import volo_d5 +from pytorch.volo.utils import load_pretrained_weights + +config = get_config() +parser = argparse.ArgumentParser('') +parser.add_argument('-cfg', type=str, default='./configs/volo_d5_512.yaml') +parser.add_argument('-dataset', type=str, default="imagenet2012") +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-eval', action="store_true") +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +args = parser.parse_args() + +config = get_config() +config = update_config(config, args) +print(config) + + +def print_model_named_params(model): + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + print('----------------------------------') + +def print_model_named_buffers(model): + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + print('----------------------------------') + +def torch_to_paddle_mapping(): + mapping = [ + ('cls_token', 'cls_token'), + ('pos_embed', 'pos_embed'), + ('patch_embed.proj', 'patch_embed.proj'), + ] + + # patch embedding: + th_prefix = 'patch_embed.conv' + pp_prefix = 'patch_embed.stem' + layer_mapping = [ + (f'{th_prefix}.0.weight', f'{pp_prefix}.0.weight'),#conv + (f'{th_prefix}.1.weight', f'{pp_prefix}.1.weight'),#bn + (f'{th_prefix}.1.bias', f'{pp_prefix}.1.bias'),#bn + (f'{th_prefix}.1.running_mean', f'{pp_prefix}.1._mean'),#bn + (f'{th_prefix}.1.running_var', f'{pp_prefix}.1._variance'),#bn + (f'{th_prefix}.3.weight', f'{pp_prefix}.3.weight'),#conv + (f'{th_prefix}.4.weight', f'{pp_prefix}.4.weight'),#bn + (f'{th_prefix}.4.bias', f'{pp_prefix}.4.bias'),#bn + (f'{th_prefix}.4.running_mean', f'{pp_prefix}.4._mean'),#bn + (f'{th_prefix}.4.running_var', f'{pp_prefix}.4._variance'),#bn + (f'{th_prefix}.6.weight', f'{pp_prefix}.6.weight'),#conv + (f'{th_prefix}.7.weight', f'{pp_prefix}.7.weight'),#bn + (f'{th_prefix}.7.bias', f'{pp_prefix}.7.bias'),#bn + (f'{th_prefix}.7.running_mean', f'{pp_prefix}.7._mean'),#bn + (f'{th_prefix}.7.running_var', f'{pp_prefix}.7._variance'),#bn + ] + mapping.extend(layer_mapping) + + + # models + for idx, stage_idx in enumerate([0, 2, 3, 4]): + for layer_idx in range(config.MODEL.TRANS.LAYERS[idx]): + pp_prefix = f'model.{stage_idx}.{layer_idx}' + th_prefix = f'network.{stage_idx}.{layer_idx}' + + if config.MODEL.TRANS.OUTLOOK_ATTENTION[idx]: + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.attn.v.weight', f'{pp_prefix}.attn.v.weight'), + (f'{th_prefix}.attn.attn', f'{pp_prefix}.attn.attn'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + else: + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.attn.qkv.weight', f'{pp_prefix}.attn.qkv.weight'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.proj'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), 
+ (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + + layer_mapping = [ + ('network.1.proj', 'model.1.proj'), + ] + mapping.extend(layer_mapping) + # Post layers + pp_prefix = f'post_model' + th_prefix = f'post_network' + for idx in range(2): + layer_mapping = [ + (f'{th_prefix}.{idx}.norm1', f'{pp_prefix}.{idx}.norm1'), + (f'{th_prefix}.{idx}.attn.kv.weight', f'{pp_prefix}.{idx}.attn.kv.weight'), + (f'{th_prefix}.{idx}.attn.q.weight', f'{pp_prefix}.{idx}.attn.q.weight'), + (f'{th_prefix}.{idx}.attn.proj', f'{pp_prefix}.{idx}.attn.proj'), + (f'{th_prefix}.{idx}.norm2', f'{pp_prefix}.{idx}.norm2'), + (f'{th_prefix}.{idx}.mlp.fc1', f'{pp_prefix}.{idx}.mlp.fc1'), + (f'{th_prefix}.{idx}.mlp.fc2', f'{pp_prefix}.{idx}.mlp.fc2'), + ] + mapping.extend(layer_mapping) + # Head layers + head_mapping = [ + ('aux_head', 'aux_head'), + ('norm', 'norm'), + ('head', 'head') + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'set {th_name} {th_shape} to {pd_name} {pd_shape}') + value = th_params[th_name].data.numpy() + if len(value.shape) == 2: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + + for name, param in torch_model.named_parameters(): + th_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + + + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_volo(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print('----------------------------------') + + device = torch.device('cpu') + torch_model = volo_d5(img_size=config.DATA.IMAGE_SIZE) # NOTE: must add img_size + load_pretrained_weights(torch_model, './pytorch/volo/d5_512_87.07.pth.tar', + use_ema=False, strict=False, num_classes=1000) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + #x = np.random.randn(1, 3, 224, 224).astype('float32') + #x = np.random.randn(2, 3, 224, 224).astype('float32') + x = np.random.randn(2, 3, 512, 512).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + print('========================================================') + print('========================================================') + print('========================================================') + print('========================================================') + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[1, 0:100]) + print(out_paddle[1, 0:100]) + assert np.allclose(out_torch[0], out_paddle[0], atol = 1e-3) + print('===== out 0 equal OK') + assert np.allclose(out_torch[1], out_paddle[1], atol = 1e-3) + print('===== out 1 equal OK') + #assert np.allclose(out_torch[2], out_paddle[2], atol = 1e-3) + #print('===== out 2 equal OK') + #assert np.allclose(out_torch[3], out_paddle[3], atol = 1e-3) + #print('===== out 3 equal OK') + + # save weights for paddle model + model_path = os.path.join('./d5_512_87.07.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/image_classification/VOLO/run_eval.sh b/image_classification/VOLO/run_eval.sh new file mode 100644 index 00000000..05193234 --- /dev/null +++ b/image_classification/VOLO/run_eval.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=7 \ +python main_single_gpu.py \ +-cfg='./configs/volo_d5_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=8 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./d5_224_86.10' diff --git a/image_classification/VOLO/run_eval_512_multi.sh b/image_classification/VOLO/run_eval_512_multi.sh new file mode 100644 index 00000000..52093b04 --- /dev/null +++ b/image_classification/VOLO/run_eval_512_multi.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/volo_d5_512.yaml' \ +-dataset='imagenet2012' \ +-batch_size=4 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./d5_512_87.07' \ diff --git a/image_classification/VOLO/run_eval_multi.sh b/image_classification/VOLO/run_eval_multi.sh new file mode 100644 index 00000000..cdca89a0 --- /dev/null +++ 
b/image_classification/VOLO/run_eval_multi.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/volo_d5_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=8 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./d5_224_86.10' \ diff --git a/image_classification/VOLO/run_train.sh b/image_classification/VOLO/run_train.sh new file mode 100644 index 00000000..f907e189 --- /dev/null +++ b/image_classification/VOLO/run_train.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ +-cfg='./configs/volo_d5_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=8 \ +-data_path='/dataset/imagenet' \ diff --git a/image_classification/VOLO/run_train_multi.sh b/image_classification/VOLO/run_train_multi.sh new file mode 100644 index 00000000..906df3c1 --- /dev/null +++ b/image_classification/VOLO/run_train_multi.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/volo_d5_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=8 \ +-data_path='/dataset/imagenet' \ diff --git a/image_classification/VOLO/tests/__init__.py b/image_classification/VOLO/tests/__init__.py new file mode 100644 index 00000000..e2cbd538 --- /dev/null +++ b/image_classification/VOLO/tests/__init__.py @@ -0,0 +1 @@ +#init diff --git a/image_classification/VOLO/tests/test_fold.py b/image_classification/VOLO/tests/test_fold.py new file mode 100644 index 00000000..f7cea792 --- /dev/null +++ b/image_classification/VOLO/tests/test_fold.py @@ -0,0 +1,75 @@ +import unittest +import numpy as np +import paddle +import paddle.nn as nn +from fold import fold + + +class FoldTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + paddle.set_device('cpu') + + @classmethod + def tearDown(cls): + pass + + #@unittest.skip('skip for debug') + def test_fold_1(self): + """test padding=2, stride=2""" + arr = [ + [1, 1, 1, 1, 2, 2, 2, 2], + [1, 1, 1, 1, 2, 2, 2, 2], + [1, 1, 1, 1, 2, 2, 2, 2], + [1, 1, 1, 1, 2, 2, 2, 2], + [3, 3, 3, 3, 4, 4, 4, 4], + [3, 3, 3, 3, 4, 4, 4, 4], + [3, 3, 3, 3, 4, 4, 4, 4], + [3, 3, 3, 3, 4, 4, 4, 4], + ] + arr = np.array(arr) + tmp = paddle.to_tensor(arr, dtype='float32') + tmp = tmp.reshape([1, 1, 8, 8]) + + unfold = paddle.nn.Unfold(3, 2, 2) + out = unfold(tmp) + out = fold(out, output_size=(8, 8), kernel_size=3, padding=2, stride=2) + ans = [[[[4. , 2. , 4. , 2. , 8. , 4. , 8. , 4. ], + [2. , 1. , 2. , 1. , 4. , 2. , 4. , 2. ], + [4. , 2. , 4. , 2. , 8. , 4. , 8. , 4. ], + [2. , 1. , 2. , 1. , 4. , 2. , 4. , 2. ], + [12., 6. , 12., 6. , 16., 8. , 16., 8. ], + [6. , 3. , 6. , 3. , 8. , 4. , 8. , 4. ], + [12., 6. , 12., 6. , 16., 8. , 16., 8. ], + [6. , 3. , 6. , 3. , 8. , 4. , 8. , 4. ]]]] + self.assertTrue(np.allclose(np.array(ans), out.numpy())) + + def test_fold_2(self): + """test padding=1, stride=1""" + arr = [ + [1, 1, 1, 1, 2, 2, 2, 2], + [1, 1, 1, 1, 2, 2, 2, 2], + [1, 1, 1, 1, 2, 2, 2, 2], + [1, 1, 1, 1, 2, 2, 2, 2], + [3, 3, 3, 3, 4, 4, 4, 4], + [3, 3, 3, 3, 4, 4, 4, 4], + [3, 3, 3, 3, 4, 4, 4, 4], + [3, 3, 3, 3, 4, 4, 4, 4], + ] + arr = np.array(arr) + tmp = paddle.to_tensor(arr, dtype='float32') + tmp = tmp.reshape([1, 1, 8, 8]) + + unfold = paddle.nn.Unfold(3, 1, 1) + out = unfold(tmp) + out = fold(out, output_size=(8, 8), kernel_size=3, padding=1, stride=1) + ans = [[[[4. , 6. , 6. , 6. , 12., 12., 12., 8. ], + [6. , 9. , 9. , 9. , 18., 18., 18., 12.], + [6. , 9. , 9. , 9. , 18., 18., 18., 12.], + [6. , 9. , 9. , 9. 
, 18., 18., 18., 12.], + [18., 27., 27., 27., 36., 36., 36., 24.], + [18., 27., 27., 27., 36., 36., 36., 24.], + [18., 27., 27., 27., 36., 36., 36., 24.], + [12., 18., 18., 18., 24., 24., 24., 16.]]]] + + self.assertTrue(np.allclose(np.array(ans), out.numpy())) diff --git a/image_classification/VOLO/tests/test_volo.py b/image_classification/VOLO/tests/test_volo.py new file mode 100644 index 00000000..5206dfe0 --- /dev/null +++ b/image_classification/VOLO/tests/test_volo.py @@ -0,0 +1,93 @@ +import unittest +import numpy as np +import paddle +import paddle.nn as nn +from config import * +from volo import * + + +class VoloTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + paddle.set_device('cpu') + cls.config = get_config() + cls.dummy_img = np.random.randn(4, 3, 224, 224).astype('float32') + cls.dummy_tensor = paddle.to_tensor(cls.dummy_img) + + @classmethod + def tearDown(cls): + pass + + #@unittest.skip('skip for debug') + def test_identity(self): + layer = Identity() + out = layer(VoloTest.dummy_tensor) + self.assertTrue(np.allclose(out.numpy(), VoloTest.dummy_tensor.numpy())) + + #@unittest.skip('skip for debug') + def test_downsample(self): + layer = Downsample(3, 16, 4) + tensor = paddle.randn(shape=[4, 256, 256, 3]) + out = layer(tensor) + self.assertEqual([4, 64, 64, 16], out.shape) + + def test_patchembedding(self): + layer = PatchEmbedding(stem_conv=True) + tensor = paddle.randn(shape=[4, 3, 224, 224]) + out = layer(tensor) + self.assertEqual([4, 384, 28, 28], out.shape) + + def test_mlp(self): + layer = Mlp(in_features=128, hidden_features=64, dropout=0.1) + tensor = paddle.randn(shape=[4, 128]) + out = layer(tensor) + self.assertEqual([4, 128], out.shape) + + def test_outlooker_attention(self): + layer = OutlookerAttention(dim=64, num_heads=8) + tensor = paddle.randn(shape=[4, 32, 32, 64]) + out = layer(tensor) + self.assertEqual([4, 32, 32, 64], out.shape) + + def test_outlooker(self): + layer = Outlooker(dim=64, kernel_size=3, padding=1, num_heads=8) + tensor = paddle.randn(shape=[4, 32, 32, 64]) + out = layer(tensor) + self.assertEqual([4, 32, 32, 64], out.shape) + + def test_attention(self): + layer = Attention(dim=64, num_heads=8) + tensor = paddle.randn(shape=[4, 32, 32, 64]) + out = layer(tensor) + self.assertEqual([4, 32, 32, 64], out.shape) + + def test_transformer(self): + layer = Transformer(dim=64, num_heads=8) + tensor = paddle.randn(shape=[4, 32, 32, 64]) + out = layer(tensor) + self.assertEqual([4, 32, 32, 64], out.shape) + + def test_class_attention(self): + layer = ClassAttention(dim=64) + tensor = paddle.randn(shape=[4, 32, 64]) + out = layer(tensor) + self.assertEqual([4, 1, 64], out.shape) + + def test_class_block(self): + layer = ClassBlock(dim=64, num_heads=8) + tensor = paddle.randn(shape=[4, 32, 64]) + out = layer(tensor) + self.assertEqual([4, 32, 64], out.shape) + + @unittest.skip('skip for debug') + def test_build_model(self): + print(VoloTest.config) + model = build_volo(VoloTest.config) + print(model) + + @unittest.skip('skip for debug') + def test_model_inference(self): + print(VoloTest.config) + model = build_volo(VoloTest.config) + print(model(VoloTest.dummy_tensor)) + diff --git a/image_classification/VOLO/utils.py b/image_classification/VOLO/utils.py new file mode 100644 index 00000000..da7c5169 --- /dev/null +++ b/image_classification/VOLO/utils.py @@ -0,0 +1,147 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""utils for ViT + +Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training +and WarmupCosineScheduler for training + +""" + +import math +from paddle.optimizer.lr import LRScheduler +import numpy as np + + +class MyPrint(): + """" Print tensor and its shape, used for debug """ + def __init__(self): + self.cnt = 0 + def myprint(self, prefix, var, cnt=None, save=None): + """print tensor and its shape, optionly save to npy + Args: + prefix: str, print info in 1st and last lines + var: Tensor, tensor needs to print + cnt: int, if self.cnt is exceed this value, print will stop + save: str, file name (should end with .npy) to save the tensor, if None no save + """ + if cnt is None or self.cnt < cnt: + print(f'------------ {prefix} ---------------') + print(var.shape, var) + print(f'------------ END {prefix} ---------------') + if save is not None: + var = var.numpy() + with open(save,'wb') as ofile: + np.save(ofile, var) + self.cnt += 1 + + +class AverageMeter(): + """ Meter for monitoring losses""" + def __init__(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.val = 0 + self.reset() + + def reset(self): + """reset all values to zeros""" + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.val = 0 + + def update(self, val, n=1): + """update avg by val and n, where val is the avg of n values""" + self.val = val + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + + +def get_exclude_from_weight_decay_fn(exclude_list=[]): + """ Set params with no weight decay during the training + + For certain params, e.g., positional encoding in ViT, weight decay + may not needed during the learning, this method is used to find + these params. + + Args: + exclude_list: a list of params names which need to exclude + from weight decay. + Returns: + exclude_from_weight_decay_fn: a function returns True if param + will be excluded from weight decay + """ + if len(exclude_list) == 0: + exclude_from_weight_decay_fn = None + else: + def exclude_fn(param): + for name in exclude_list: + if param.endswith(name): + return False + return True + exclude_from_weight_decay_fn = exclude_fn + return exclude_from_weight_decay_fn + + +class WarmupCosineScheduler(LRScheduler): + """Warmup Cosine Scheduler + + First apply linear warmup, then apply cosine decay schedule. + Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" + Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining + "total_epochs - warmup_epochs" + + Attributes: + learning_rate: the starting learning rate (without warmup), not used here! 
+ warmup_start_lr: warmup starting learning rate + start_lr: the starting learning rate (without warmup) + end_lr: the ending learning rate after whole loop + warmup_epochs: # of epochs for warmup + total_epochs: # of total epochs (include warmup) + """ + def __init__(self, + learning_rate, + warmup_start_lr, + start_lr, + end_lr, + warmup_epochs, + total_epochs, + cycles=0.5, + last_epoch=-1, + verbose=False): + """init WarmupCosineScheduler """ + self.warmup_epochs = warmup_epochs + self.total_epochs = total_epochs + self.warmup_start_lr = warmup_start_lr + self.start_lr = start_lr + self.end_lr = end_lr + self.cycles = cycles + super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + """ return lr value """ + if self.last_epoch < self.warmup_epochs: + val = (self.start_lr - self.warmup_start_lr) * float( + self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr + return val + + progress = float(self.last_epoch - self.warmup_epochs) / float( + max(1, self.total_epochs - self.warmup_epochs)) + val = max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) + return val diff --git a/image_classification/VOLO/volo.png b/image_classification/VOLO/volo.png new file mode 100644 index 00000000..6f2ac902 Binary files /dev/null and b/image_classification/VOLO/volo.png differ diff --git a/image_classification/VOLO/volo.py b/image_classification/VOLO/volo.py new file mode 100644 index 00000000..371ccf18 --- /dev/null +++ b/image_classification/VOLO/volo.py @@ -0,0 +1,793 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Implement VOLO Class +""" + +import math +import copy +import numpy as np +import paddle +import paddle.nn as nn +from droppath import DropPath +from fold import fold +#from utils import MyPrint +#myprint = MyPrint() + +class Identity(nn.Layer): + """ Identity layer + + The output of this layer is the input without any change. + Use this layer to avoid using 'if' condition in forward methods + """ + def __init__(self): + super(Identity, self).__init__() + def forward(self, x): + return x + + +class Downsample(nn.Layer): + """Apply a Conv2D with kernel size = patch_size and stride = patch_size + The shape of input tensor is [N, H, W, C], which will be transposed to + [N, C, H, W] and feed into Conv, finally the output is transposed back + to [N, H, W, C]. 
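+
+    For example (mirroring test_downsample in tests/test_volo.py), an input of shape
+    [4, 256, 256, 3] with in_embed_dim=3, out_embed_dim=16 and patch_size=4 is
+    projected to an output of shape [4, 64, 64, 16].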
+ + Args: + in_embed_dim: int, input feature dimension + out_embed_dim: int, output feature dimension + patch_size: kernel_size and stride + """ + def __init__(self, in_embed_dim, out_embed_dim, patch_size): + super().__init__() + self.proj = nn.Conv2D(in_embed_dim, + out_embed_dim, + kernel_size=patch_size, + stride=patch_size) + + def forward(self, x): + x = x.transpose([0, 3, 1, 2]) + x = self.proj(x) + x = x.transpose([0, 2, 3, 1]) + return x + + +class PatchEmbedding(nn.Layer): + """Patch Embeddings with stem conv layers + + If stem conv layers are set, the image is firstly feed into stem layers, + stem layers contains 3 conv-bn-relu blocks. + Then a proj (conv2d) layer is applied as the patch embedding. + + Args: + image_size: int, input image size, default: 224 + stem_conv: bool, if apply stem conv layers, default: False + stem_stride: int, conv stride in stem layers, default: 1 + patch_size: int, patch size for patch embedding (k and stride for proj conv), default: 8 + in_channels: int, input channels, default: 3 + hidden_dim: int, input dimension of patch embedding (out dim for stem), default: 64 + embed_dim: int, output dimension of patch embedding, default: 384 + + """ + def __init__(self, + image_size=224, + stem_conv=False, + stem_stride=1, + patch_size=8, + in_channels=3, + hidden_dim=64, + embed_dim=384): + super().__init__() + assert patch_size in [4, 8, 16] + + # define stem conv layers + if stem_conv: + self.stem = nn.Sequential( + nn.Conv2D(in_channels, + hidden_dim, + kernel_size=7, + stride=stem_stride, + padding=3, + bias_attr=False), + nn.BatchNorm2D(hidden_dim, momentum=0.9), + nn.ReLU(), + nn.Conv2D(hidden_dim, + hidden_dim, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False), + nn.BatchNorm2D(hidden_dim, momentum=0.9), + nn.ReLU(), + nn.Conv2D(hidden_dim, + hidden_dim, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False), + nn.BatchNorm2D(hidden_dim, momentum=0.9), + nn.ReLU(), + ) + else: + self.stem = Identity() + + # define patch embeddings + self.proj = nn.Conv2D(hidden_dim, + embed_dim, + kernel_size = patch_size // stem_stride, + stride = patch_size // stem_stride) + # num patches + self.num_patches = (image_size // patch_size) * (image_size // patch_size) + + def forward(self, x): + x = self.stem(x) # Identity layer if stem is not set + x = self.proj(x) + return x + + +class Mlp(nn.Layer): + """ MLP module + + Impl using nn.Linear and activation is GELU, dropout is applied. 
+ Ops: fc -> act -> dropout -> fc -> dropout + + Attributes: + fc1: nn.Linear + fc2: nn.Linear + act: GELU + dropout1: dropout after fc1 + dropout2: dropout after fc2 + """ + + def __init__(self, in_features, hidden_features, dropout=0.): + super(Mlp, self).__init__() + w_attr_1, b_attr_1 = self._init_weights() + self.fc1 = nn.Linear(in_features, + hidden_features, + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + w_attr_2, b_attr_2 = self._init_weights() + self.fc2 = nn.Linear(hidden_features, + in_features, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + self.act = nn.GELU() + self.dropout = nn.Dropout(dropout) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierUniform()) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Normal(std=1e-6)) + return weight_attr, bias_attr + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.dropout(x) + x = self.fc2(x) + x = self.dropout(x) + return x + + +class OutlookerAttention(nn.Layer): + """ Outlooker Attention + + Outlooker attention firstly applies a nn.Linear op, and unfold (im2col) the output + tensor, then use tensor reshape to get the 'V'. 'Attn' is obtained by pool, linear and reshape + ops applied on input tensor. Then a matmul is applied for 'V' and 'Attn'. Finally, a + fold op is applied with a linear projection to get the output. + + Args: + dim: int, all heads dimension + num_heads: int, num of heads + kernel_size: int, size used in fold/unfold, and pool, default: 3 + padding: int, pad used in fold/unfold, default: 1 + stride: int, stride used in fold/unfold, and pool, default: 1 + qkv_bias: bool, if True, qkv linear layer is using bias, default: False + qk_scale: float, if None, qk_scale is dim_head ** -0.5, default: None + attention_dropout: float, dropout rate for attention dropout, default: 0. + dropout: float, dropout rate for projection dropout, default: 0. 
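+
+    Rough shape flow through forward, with h = ceil(H / stride), w = ceil(W / stride),
+    k = kernel_size:
+        v:    linear -> unfold -> reshape            -> [B, num_heads, h*w, k*k, C // num_heads]
+        attn: pool -> linear -> reshape              -> [B, num_heads, h*w, k*k, k*k]
+        out:  matmul(attn, v) -> reshape -> fold -> proj -> [B, H, W, C]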
+ """ + + def __init__(self, + dim, + num_heads, + kernel_size=3, + padding=1, + stride=1, + qkv_bias=False, + qk_scale=None, + attention_dropout=0., + dropout=0.): + super().__init__() + self.num_heads = num_heads + self.dim = dim + self.dim_head = dim // num_heads + self.scale = qk_scale or self.dim_head ** -0.5 + + self.kernel_size = kernel_size + self.padding = padding + self.stride = stride + + self.v = nn.Linear(dim, dim, bias_attr=qkv_bias) + self.attn = nn.Linear(dim, (kernel_size ** 4) * num_heads) + self.attn_dropout = nn.Dropout(attention_dropout) + + self.proj = nn.Linear(dim, dim) + self.proj_dropout = nn.Dropout(dropout) + self.softmax = nn.Softmax(axis=-1) + + self.pool = nn.AvgPool2D(kernel_size=stride, stride=stride, ceil_mode=True) + + self.unfold = paddle.nn.Unfold(kernel_sizes=kernel_size, strides=self.stride, paddings=self.padding) + + def forward(self, x): + B, H, W, C = x.shape + + v = self.v(x) # B, H, W, C + v = v.transpose([0, 3, 1, 2]) # B, C, H, W + + h, w = math.ceil(H / self.stride), math.ceil(W / self.stride) + + # current paddle version has bugs using nn.Unfold + v = paddle.nn.functional.unfold(v, + kernel_sizes=self.kernel_size, + paddings=self.padding, + strides=self.stride) # B, C*kernel_size*kernel_size, L(num of patches) + + v = v.reshape([B, + self.num_heads, + C // self.num_heads, + self.kernel_size * self.kernel_size, + h * w]) + v = v.transpose([0, 1, 4, 3, 2]) + + x = x.transpose([0, 3, 1, 2]) + attn = self.pool(x) + attn = attn.transpose([0, 2, 3, 1]) # B, H', W', C + attn = self.attn(attn) + attn = attn.reshape([B, + h*w, + self.num_heads, + self.kernel_size * self.kernel_size, + self.kernel_size * self.kernel_size]) + attn = attn.transpose([0, 2, 1, 3, 4]) + + attn = attn * self.scale + attn = self.softmax(attn) + attn = self.attn_dropout(attn) + + z = paddle.matmul(attn, v) + z = z.transpose([0, 1, 4, 3, 2]) + new_shape = [B, C * self.kernel_size * self.kernel_size, h * w] + z = z.reshape(new_shape) + + # Current Paddle dose not have Fold op, we hacked our fold op, see ./fold.py for details + z = fold(z, output_size=(H, W), kernel_size=self.kernel_size, + padding=self.padding, stride=self.stride) + + z = z.transpose([0, 2, 3, 1]) + z = self.proj(z) + z = self.proj_dropout(z) + + return z + + +class Outlooker(nn.Layer): + """ Outlooker + + Outlooker contains norm layers, outlooker attention, mlp and droppath layers, + and residual is applied during forward. + + Args: + dim: int, all heads dimension + num_heads: int, num of heads + kernel_size: int, size used in fold/unfold, and pool, default: 3 + padding: int, pad used in fold/unfold, default: 1 + mlp_ratio: float, ratio to multiply with dim for mlp hidden feature dim, default: 3. + stride: int, stride used in fold/unfold, and pool, default: 1 + qkv_bias: bool, if True, qkv linear layer is using bias, default: False + qk_scale: float, if None, qk_scale is dim_head ** -0.5, default: None + attention_dropout: float, dropout rate for attention dropout, default: 0. + dropout: float, dropout rate for projection dropout, default: 0. 
+ """ + def __init__(self, + dim, + kernel_size, + padding, + stride=1, + num_heads=1, + mlp_ratio=3., + attention_dropout=0., + droppath=0., + qkv_bias=False, + qk_scale=None): + super().__init__() + self.norm1 = nn.LayerNorm(dim) + self.attn = OutlookerAttention(dim, + num_heads, + kernel_size=kernel_size, + padding=padding, + stride=stride, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attention_dropout=attention_dropout) + self.drop_path = Droppath(droppath) if droppath > 0. else Identity() + self.norm2 = nn.LayerNorm(dim) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim * mlp_ratio)) + + def forward(self, x): + h = x + x = self.norm1(x) + x = self.attn(x) + x = self.drop_path(x) + x = h + x + + h = x + x = self.norm2(x) + x = self.mlp(x) + x = self.drop_path(x) + x = h + x + + return x + + +class Attention(nn.Layer): + """ Attention + + Regular Attention module same as ViT + + Args: + dim: int, all heads dimension + num_heads: int, num of heads + qkv_bias: bool, if True, qkv linear layer is using bias, default: False + qk_scale: float, if None, qk_scale is dim_head ** -0.5, default: None + attention_dropout: float, dropout rate for attention dropout, default: 0. + dropout: float, dropout rate for projection dropout, default: 0. + """ + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attention_dropout=0., + dropout=0.): + super().__init__() + self.num_heads = num_heads + self.dim_head = dim // num_heads + self.scale = qk_scale or self.dim_head ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_dropout = nn.Dropout(attention_dropout) + self.softmax = nn.Softmax(axis=-1) + self.proj = nn.Linear(dim, dim) + self.proj_dropout = nn.Dropout(dropout) + + def forward(self, x): + B, H, W, C = x.shape + + qkv = self.qkv(x) + qkv = qkv.reshape([B, H * W, 3, self.num_heads, C // self.num_heads]) + qkv = qkv.transpose([2, 0, 3, 1, 4]) + + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = paddle.matmul(q, k, transpose_y=True) + attn = attn * self.scale + attn = self.softmax(attn) + attn = self.attn_dropout(attn) + + z = paddle.matmul(attn, v) + z = z.transpose([0, 2, 1, 3]) + + z = z.reshape([B, H, W, C]) + z = self.proj(z) + z = self.proj_dropout(z) + + return z + + +class Transformer(nn.Layer): + """Transformer + + Transformer module, same as ViT + + Args: + dim: int, all heads dimension + num_heads: int, num of heads + mlp_ratio: float, ratio to multiply with dim for mlp hidden feature dim, default: 4. + qkv_bias: bool, if True, qkv linear layer is using bias, default: False + qk_scale: float, if None, qk_scale is dim_head ** -0.5, default: None + attention_dropout: float, dropout rate for attention dropout, default: 0. + dropout: float, dropout rate for projection dropout, default: 0. + """ + + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + attention_dropout=0, + droppath=0.): + super().__init__() + self.norm1 = nn.LayerNorm(dim) + self.attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attention_dropout=attention_dropout) + self.drop_path = DropPath(droppath) if droppath > 0. 
else Identity() + self.norm2 = nn.LayerNorm(dim) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim * mlp_ratio)) + + def forward(self, x): + h = x + x = self.norm1(x) + x = self.attn(x) + x = self.drop_path(x) + x = h + x + + h = x + x = self.norm2(x) + x = self.mlp(x) + x = self.drop_path(x) + x = h + x + + return x + + +class ClassAttention(nn.Layer): + """ Class Attention + + Class Attention modlee same as CaiT + + Args: + dim: int, all heads dimension + dim_head: int, single heads dimension, default: None + num_heads: int, num of heads + qkv_bias: bool, if True, qkv linear layer is using bias, default: False + qk_scale: float, if None, qk_scale is dim_head ** -0.5, default: None + attention_dropout: float, dropout rate for attention dropout, default: 0. + dropout: float, dropout rate for projection dropout, default: 0. + """ + + def __init__(self, + dim, + num_heads=8, + dim_head=None, + qkv_bias=False, + qk_scale=None, + attention_dropout=0., + dropout=0.): + super().__init__() + self.num_heads = num_heads + if dim_head is not None: + self.dim_head = dim_head + else: + self.dim_head = dim // num_heads + + self.scale = qk_scale or self.dim_head ** -0.5 + + self.kv = nn.Linear(dim, + self.dim_head * self.num_heads * 2, + bias_attr=qkv_bias) + self.q = nn.Linear(dim, + self.dim_head * self.num_heads, + bias_attr=qkv_bias) + self.attn_dropout = nn.Dropout(attention_dropout) + self.proj = nn.Linear(self.dim_head * self.num_heads, dim) + self.proj_dropout = nn.Dropout(dropout) + self.softmax = nn.Softmax(axis=-1) + + def forward(self, x): + B, N, C = x.shape + kv = self.kv(x) + kv = kv.reshape([B, N, 2, self.num_heads, self.dim_head]) + kv = kv.transpose([2, 0, 3, 1, 4]) + k, v = kv[0], kv[1] + + q = self.q(x[:, :1, :]) + q = q.reshape([B, self.num_heads, 1, self.dim_head]) + attn = paddle.matmul(q * self.scale, k, transpose_y=True) + attn = self.softmax(attn) + attn = self.attn_dropout(attn) + + cls_embed = paddle.matmul(attn, v) + cls_embed = cls_embed.transpose([0, 2, 1, 3]) + cls_embed = cls_embed.reshape([B, 1, self.dim_head * self.num_heads]) + cls_embed = self.proj(cls_embed) + cls_embed = self.proj_dropout(cls_embed) + return cls_embed + + +class ClassBlock(nn.Layer): + """Class Attention Block (CaiT) + + CaiT module + + Args: + dim: int, all heads dimension + num_heads: int, num of heads + mlp_ratio: float, ratio to multiply with dim for mlp hidden feature dim, default: 4. + qkv_bias: bool, if True, qkv linear layer is using bias, default: False + qk_scale: float, if None, qk_scale is dim_head ** -0.5, default: None + attention_dropout: float, dropout rate for attention dropout, default: 0. + dropout: float, dropout rate for projection dropout, default: 0. + """ + + def __init__(self, + dim, + num_heads, + dim_head=None, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + dropout=0., + attention_dropout=0., + droppath=0.): + super().__init__() + self.norm1 = nn.LayerNorm(dim) + self.attn = ClassAttention(dim, + num_heads=num_heads, + dim_head=dim_head, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attention_dropout=attention_dropout, + dropout=dropout) + self.drop_path = DropPath(droppath) if droppath > 0. 
else Identity() + self.norm2 = nn.LayerNorm(dim) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim * mlp_ratio), + dropout=dropout) + + def forward(self, x): + cls_embed = x[:, :1] + + h = self.norm1(x) + h = self.attn(h) + h = self.drop_path(h) + cls_embed = cls_embed + h + h = cls_embed + cls_embed = self.norm2(cls_embed) + cls_embed = self.mlp(cls_embed) + cls_embed = self.drop_path(cls_embed) + cls_embed = h + cls_embed + out = paddle.concat([cls_embed, x[:, 1:]], axis=1) + + return out + + +def rand_bbox(size, lam, scale=1): + """ + get bounding box as token labeling (https://github.com/zihangJiang/TokenLabeling) + return: bounding box + """ + W = size[1] // scale + H = size[2] // scale + cut_rat = np.sqrt(1. - lam) + cut_w = np.int(W * cut_rat) + cut_h = np.int(H * cut_rat) + + # uniform + cx = np.random.randint(W) + cy = np.random.randint(H) + + bbx1 = np.clip(cx - cut_w // 2, 0, W) + bby1 = np.clip(cy - cut_h // 2, 0, H) + bbx2 = np.clip(cx + cut_w // 2, 0, W) + bby2 = np.clip(cy + cut_h // 2, 0, H) + # item() get the python native dtype + return bbx1.item(), bby1.item(), bbx2.item(), bby2.item() + + +class VOLO(nn.Layer): + def __init__(self, + layers, + image_size=224, + in_channels=3, + num_classes=1000, + patch_size=8, + stem_hidden_dim=64, + embed_dims=None, + num_heads=None, + downsamples=None, + outlook_attention=None, + mlp_ratios=None, + qkv_bias=False, + qk_scale=None, + dropout=0., + attention_dropout=0., + droppath=0., + num_post_layers=2, + return_mean=False, + return_dense=True, + mix_token=True, + pooling_scale=2, + out_kernel=3, + out_stride=2, + out_padding=1): + super().__init__() + self.num_classes = num_classes + self.patch_embed = PatchEmbedding(image_size=image_size, + stem_conv=True, + stem_stride=2, + patch_size=patch_size, + in_channels=in_channels, + hidden_dim=stem_hidden_dim, + embed_dim=embed_dims[0]) + self.pos_embed = paddle.create_parameter( + shape=[1, + image_size // patch_size // pooling_scale, + image_size // patch_size // pooling_scale, + embed_dims[-1]], + dtype='float32', + default_initializer=nn.initializer.Constant(0.0)) + + self.pos_dropout = nn.Dropout(dropout) + + layer_list = [] + for i in range(len(layers)): + blocks = [] + for block_idx in range(layers[i]): + block_droppath = droppath * ( + block_idx + sum(layers[:i])) / (sum(layers) - 1) + if outlook_attention[i]: + blocks.append( + copy.deepcopy( + Outlooker(dim=embed_dims[i], + kernel_size=out_kernel, + padding=out_padding, + stride=out_stride, + num_heads=num_heads[i], + mlp_ratio=mlp_ratios[i], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attention_dropout=attention_dropout, + droppath=block_droppath))) + else: + blocks.append( + copy.deepcopy( + Transformer(dim=embed_dims[i], + num_heads=num_heads[i], + mlp_ratio=mlp_ratios[i], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attention_dropout=attention_dropout, + droppath=block_droppath)) + ) + stage = nn.Sequential(*blocks) + layer_list.append(stage) + + if downsamples[i]: + layer_list.append(copy.deepcopy(Downsample(embed_dims[i], embed_dims[i + 1], 2))) + + self.model = nn.LayerList(layer_list) + + + # POST Layers (from CaiT) + self.post_model = None + if num_post_layers is not None: + self.post_model = nn.LayerList([ + copy.deepcopy( + ClassBlock(dim=embed_dims[-1], + num_heads=num_heads[-1], + mlp_ratio=mlp_ratios[-1], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attention_dropout=attention_dropout, + droppath=0.) 
+ ) for i in range(num_post_layers) + ]) + self.cls_token = paddle.create_parameter( + shape=[1, 1, embed_dims[-1]], + dtype='float32', + default_initializer=nn.initializer.TruncatedNormal(std=.02)) + + # Output + self.return_mean = return_mean # if True, return mean, not use class token + self.return_dense = return_dense # if True, return class token and all feature tokens + if return_dense: + assert not return_mean, "Cannot return both mean and dense" + self.mix_token = mix_token + self.pooling_scale = pooling_scale + if mix_token: + self.beta = 1.0 + assert return_dense, 'return all tokens if mix_token is enabled' + if return_dense: + self.aux_head = nn.Linear(embed_dims[-1], num_classes) if num_classes > 0 else Identity() + self.norm = nn.LayerNorm(embed_dims[-1]) + + self.head = nn.Linear(embed_dims[-1], num_classes) if num_classes > 0 else Identity() + + # For training: + # TODO: set pos_embed, trunc_normal + # TODO: set init weights for linear layers and layernorm layers + # TODO: set no weight decay for pos_embed and cls_token + + + def forward(self, x): + # Step1: patch embedding + x = self.patch_embed(x) + x = x.transpose([0, 2, 3, 1]) + if self.mix_token and self.training: + lam = np.random.beta(self.beta, self.beta) + patch_h = x.shape[1] // self.pooling_scale + patch_w = x.shape[2] // self.pooling_scale + bbx1, bby1, bbx2, bby2 = rand_bbox(x.shape, lam, scale=self.pooling_scale) + temp_x = x.clone() + sbbx1 = self.pooling_scale * bbx1 + sbby1 = self.pooling_scale * bby1 + sbbx2 = self.pooling_scale * bbx2 + sbby2 = self.pooling_scale * bby2 + temp_x[:, sbbx1: sbbx2, sbby1: sbby2, :] = x.flip(axis=[0])[:, sbbx1: sbbx2, sbby1: sbby2, :] + x = temp_x + else: + bbx1, bby1, bbx2, bby2 = 0, 0, 0, 0 + # Step2: 2-stages tokens learning + for idx, block in enumerate(self.model): + if idx == 2: # add pos_embed after outlooker blocks (and a downsample layer) + x = x + self.pos_embed + x = self.pos_dropout(x) + x = block(x) + + x = x.reshape([x.shape[0], -1, x.shape[-1]]) # B, H*W, C + # Step3: post layers (from CaiT) + if self.post_model is not None: + cls_token = self.cls_token.expand([x.shape[0], -1, -1]) + x = paddle.concat([cls_token, x], axis=1) + for block in self.post_model: + x = block(x) + x = self.norm(x) + + if self.return_mean: + return self.head(x.mean(1)) + + x_cls = self.head(x[:, 0]) + if not self.return_dense: + return x_cls + + x_aux = self.aux_head(x[:, 1:]) + + if not self.training: + #NOTE: pytorch Tensor.max() returns a tuple of Tensor: (values, indices), while + # paddle Tensor.max() returns a single Tensor: values + return x_cls + 0.5 * x_aux.max(1) + + if self.mix_token and self.training: + x_aux = x_aux.reshape([x_aux.shape[0], patch_h, patch_w, x_aux.shape[-1]]) + temp_x = x_aux.clone() + temp_x[:, bbx1:bbx2, bby1:bby2, :] = x_aux.flip(axis=[0])[:, bbx1:bbx2, bby1:bby2, :] + x_aux = temp_x + x_aux = x_aux.reshape([x_aux.shape[0], patch_h*patch_w, x_aux.shape[-1]]) + + return x_cls, x_aux, (bbx1, bby1, bbx2, bby2) + + + +def build_volo(config): + """build volo model using config""" + model = VOLO(image_size=config.DATA.IMAGE_SIZE, + layers=config.MODEL.TRANS.LAYERS, + embed_dims=config.MODEL.TRANS.EMBED_DIMS, + mlp_ratios=config.MODEL.TRANS.MLP_RATIOS, + downsamples=config.MODEL.TRANS.DOWNSAMPLES, + outlook_attention=config.MODEL.TRANS.OUTLOOK_ATTENTION, + stem_hidden_dim=config.MODEL.STEM_HIDDEN_DIM, + num_heads=config.MODEL.TRANS.NUM_HEADS, + qkv_bias=config.MODEL.TRANS.QKV_BIAS, + qk_scale=config.MODEL.TRANS.QK_SCALE) + return model diff --git 
a/image_classification/ViT/README.md b/image_classification/ViT/README.md new file mode 100644 index 00000000..76dab359 --- /dev/null +++ b/image_classification/ViT/README.md @@ -0,0 +1,167 @@ +# An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale, [arxiv](https://arxiv.org/abs/2010.11929) + +PaddlePaddle training/validation code and pretrained models for **ViT**. + +The official TF implementation is [here](https://github.com/google-research/vision_transformer). + +This implementation is developed by [PaddleViT](https://github.com/BR-IDL/PaddleViT.git). + + +

+<h4 align="center">ViT Model Overview</h4>

+ + +### Update +Update (2021-08-11): Code is released and ported weights are uploaded. + +## Models Zoo +| Model | Acc@1 | Acc@5 | Image Size | Crop_pct | Interpolation | Link | +|--------------------------------|-------|-------|------------|----------|---------------|--------------| +| vit_base_patch16_224 | 84.58 | 97.30 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/13D9FqU4ISsGxWXURgKW9eLOBV-pYPr-L/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1ms3o2fHMQpIoVqnEHitRtA)(qv4n) | +| vit_base_patch16_384 | 85.99 | 98.00 | 384 | 1.0 | bicubic | [google](https://drive.google.com/file/d/1kWKaAgneDx0QsECxtf7EnUdUZej6vSFT/view?usp=sharing)/[baidu](https://pan.baidu.com/s/15ggLdiL98RPcz__SXorrXA)(wsum) | +| vit_large_patch16_224 | 85.81 | 97.82 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1jgwtmtp_cDWEhZE-FuWhs7lCdpqhAMft/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1HRxUJAwEiKgrWnJSjHyU0A)(1bgk) | + +> *The results are evaluated on ImageNet2012 validation set. + +## Notebooks +We provide a few notebooks in aistudio to help you get started: + +**\*(coming soon)\*** + + +## Requirements +- Python>=3.6 +- yaml>=0.2.5 +- [PaddlePaddle](https://www.paddlepaddle.org.cn/documentation/docs/en/install/index_en.html)>=2.1.0 +- [yacs](https://github.com/rbgirshick/yacs)>=0.1.8 + +## Data +ImageNet2012 dataset is used in the following folder structure: +``` +│imagenet/ +├──train/ +│ ├── n01440764 +│ │ ├── n01440764_10026.JPEG +│ │ ├── n01440764_10027.JPEG +│ │ ├── ...... +│ ├── ...... +├──val/ +│ ├── n01440764 +│ │ ├── ILSVRC2012_val_00000293.JPEG +│ │ ├── ILSVRC2012_val_00002138.JPEG +│ │ ├── ...... +│ ├── ...... +``` + +## Usage +To use the model with pretrained weights, download the `.pdparam` weight file and change related file paths in the following python scripts. The model config files are located in `./configs/`. + +For example, assume the downloaded weight file is stored in `./vit_base_patch16_224.pdparams`, to use the `vit_base_patch16_224` model in python: +```python +from config import get_config +from visual_transformer import build_vit as build_model +# config files in ./configs/ +config = get_config('./configs/vit_base_patch16_224.yaml') +# build model +model = build_model(config) +# load pretrained weights, .pdparams is NOT needed +model_state_dict = paddle.load('./vit_base_patch16_224') +model.set_dict(model_state_dict) +``` + +## Evaluation +To evaluate ViT model performance on ImageNet2012 with a single GPU, run the following script using command line: +```shell +sh run_eval.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/vit_base_patch16_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./vit_base_patch16_224' +``` + +
+ + +Run evaluation using multi-GPUs: + + + +```shell +sh run_eval_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/vit_base_patch16_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./vit_base_patch16_224' +``` + +
+ + +## Training +To train the ViT model on ImageNet2012 with single GPU, run the following script using command line: +```shell +sh run_train.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/vit_base_patch16_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=32 \ + -data_path='/dataset/imagenet' \ +``` + + +
+ + +Run training using multi-GPUs: + + + +```shell +sh run_train_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/vit_base_patch16_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ +``` + +
+ + + +## Visualization Attention Map +**(coming soon)** + +## Reference +``` +@article{dosovitskiy2020image, + title={An image is worth 16x16 words: Transformers for image recognition at scale}, + author={Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and others}, + journal={arXiv preprint arXiv:2010.11929}, + year={2020} +} +``` diff --git a/image_classification/ViT/config.py b/image_classification/ViT/config.py new file mode 100644 index 00000000..aed498b0 --- /dev/null +++ b/image_classification/ViT/config.py @@ -0,0 +1,147 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration + +Configuration for data, model archtecture, and training, etc. +Config can be set by .yaml file or by argparser(limited usage) + + +""" +import os +from yacs.config import CfgNode as CN +import yaml + +_C = CN() +_C.BASE = [''] + +# data settings +_C.DATA = CN() +_C.DATA.BATCH_SIZE = 256 #256 # train batch_size for single GPU +_C.DATA.BATCH_SIZE_EVAL = 8 #64 # val batch_size for single GPU +_C.DATA.DATA_PATH = '/dataset/imagenet/' # path to dataset +_C.DATA.DATASET = 'imagenet2012' # dataset name +_C.DATA.IMAGE_SIZE = 224 # input image size: 224 for pretrain, 384 for finetune +_C.DATA.CROP_PCT = 0.875 # input image scale ratio, scale is applied before centercrop in eval mode +_C.DATA.NUM_WORKERS = 2 # number of data loading threads + +# model settings +_C.MODEL = CN() +_C.MODEL.TYPE = 'ViT' +_C.MODEL.NAME = 'ViT' +_C.MODEL.RESUME = None +_C.MODEL.PRETRAINED = None +_C.MODEL.NUM_CLASSES = 1000 +_C.MODEL.DROPOUT = 0.1 +_C.MODEL.DROPPATH = 0.1 +_C.MODEL.ATTENTION_DROPOUT = 0.1 + +# transformer settings +_C.MODEL.TRANS = CN() +_C.MODEL.TRANS.PATCH_SIZE = 32 +_C.MODEL.TRANS.EMBED_DIM = 768 +_C.MODEL.TRANS.MLP_RATIO= 4.0 +_C.MODEL.TRANS.NUM_HEADS = 12 +_C.MODEL.TRANS.DEPTH = 12 +_C.MODEL.TRANS.QKV_BIAS = True + +# training settings +_C.TRAIN = CN() +_C.TRAIN.LAST_EPOCH = 0 +_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 3 #34 # ~ 10k steps for 4096 batch size +_C.TRAIN.WEIGHT_DECAY = 0.05 #0.3 # 0.0 for finetune +_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune +_C.TRAIN.WARMUP_START_LR = 1e-6 #0.0 +_C.TRAIN.END_LR = 5e-4 +_C.TRAIN.GRAD_CLIP = 1.0 +_C.TRAIN.ACCUM_ITER = 2 #1 + +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' +_C.TRAIN.LR_SCHEDULER.MILESTONES = "30, 60, 90" # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 # only used in StepLRScheduler + +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'AdamW' +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for adamW +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# misc +_C.SAVE = "./output" +_C.TAG = "default" +_C.SAVE_FREQ = 10 # freq to save chpt +_C.REPORT_FREQ = 100 # freq 
to logging info +_C.VALIDATE_FREQ = 100 # freq to do validation +_C.SEED = 0 +_C.EVAL = False # run evaluation only +_C.LOCAL_RANK = 0 +_C.NGPUS = -1 + + +def _update_config_from_file(config, cfg_file): + config.defrost() + with open(cfg_file, 'r') as infile: + yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader) + for cfg in yaml_cfg.setdefault('BASE', ['']): + if cfg: + _update_config_from_file( + config, os.path.join(os.path.dirname(cfg_file), cfg) + ) + print('merging config from {}'.format(cfg_file)) + config.merge_from_file(cfg_file) + config.freeze() + +def update_config(config, args): + """Update config by ArgumentParser + Args: + args: ArgumentParser contains options + Return: + config: updated config + """ + if args.cfg: + _update_config_from_file(config, args.cfg) + config.defrost() + if args.dataset: + config.DATA.DATASET = args.dataset + if args.batch_size: + config.DATA.BATCH_SIZE = args.batch_size + if args.image_size: + config.DATA.IMAGE_SIZE = args.image_size + if args.data_path: + config.DATA.DATA_PATH = args.data_path + if args.ngpus: + config.NGPUS = args.ngpus + if args.eval: + config.EVAL = True + config.DATA.BATCH_SIZE_EVAL = args.batch_size + if args.pretrained: + config.MODEL.PRETRAINED = args.pretrained + if args.resume: + config.MODEL.RESUME = args.resume + if args.last_epoch: + config.MODEL.LAST_EPOCH = args.last_epoch + + #config.freeze() + return config + + +def get_config(cfg_file=None): + """Return a clone of config or load from yaml file""" + config = _C.clone() + if cfg_file: + _update_config_from_file(config, cfg_file) + return config diff --git a/image_classification/ViT/configs/vit_base_patch16_224.yaml b/image_classification/ViT/configs/vit_base_patch16_224.yaml new file mode 100644 index 00000000..eff0fc29 --- /dev/null +++ b/image_classification/ViT/configs/vit_base_patch16_224.yaml @@ -0,0 +1,21 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: ViT + NAME: vit_base_patch16_224 + TRANS: + PATCH_SIZE: 16 + EMBED_DIM: 768 + MLP_RATIO: 4.0 + DEPTH: 12 + NUM_HEADS: 12 + QKV_BIAS: true +TRAIN: + NUM_EPOCHS: 300 + WARMUP_EPOCHS: 3 + WEIGHT_DECAY: 0.3 + BASE_LR: 0.003 + WARMUP_START_LR: 1e-6 + END_LR: 5e-4 + ACCUM_ITER: 2 diff --git a/image_classification/ViT/configs/vit_base_patch16_384.yaml b/image_classification/ViT/configs/vit_base_patch16_384.yaml new file mode 100644 index 00000000..04cdfaee --- /dev/null +++ b/image_classification/ViT/configs/vit_base_patch16_384.yaml @@ -0,0 +1,14 @@ +DATA: + IMAGE_SIZE: 384 + CROP_PCT: 1.0 +MODEL: + TYPE: ViT + NAME: vit_base_patch16_384 + TRANS: + PATCH_SIZE: 16 + EMBED_DIM: 768 + MLP_RATIO: 4.0 + DEPTH: 12 + NUM_HEADS: 12 + QKV_BIAS: true + diff --git a/image_classification/ViT/configs/vit_large_patch16_224.yaml b/image_classification/ViT/configs/vit_large_patch16_224.yaml new file mode 100644 index 00000000..23ac9b37 --- /dev/null +++ b/image_classification/ViT/configs/vit_large_patch16_224.yaml @@ -0,0 +1,14 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: ViT + NAME: vit_large_patch16_224 + TRANS: + PATCH_SIZE: 16 + EMBED_DIM: 1024 + MLP_RATIO: 4.0 + DEPTH: 24 + NUM_HEADS: 16 + QKV_BIAS: true + diff --git a/image_classification/ViT/datasets.py b/image_classification/ViT/datasets.py new file mode 100644 index 00000000..e207f9ba --- /dev/null +++ b/image_classification/ViT/datasets.py @@ -0,0 +1,187 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Dataset related classes and methods for ViT training and validation +Cifar10, Cifar100 and ImageNet2012 are supported +""" + +import os +import math +from paddle.io import Dataset, DataLoader, DistributedBatchSampler +from paddle.vision import transforms, datasets, image_load + +class ImageNet2012Dataset(Dataset): + """Build ImageNet2012 dataset + + This class gets train/val imagenet datasets, which loads transfomed data and labels. + + Attributes: + file_folder: path where imagenet images are stored + transform: preprocessing ops to apply on image + img_path_list: list of full path of images in whole dataset + label_list: list of labels of whole dataset + """ + + def __init__(self, file_folder, mode="train", transform=None): + """Init ImageNet2012 Dataset with dataset file path, mode(train/val), and transform""" + super(ImageNet2012Dataset, self).__init__() + assert mode in ["train", "val"] + self.file_folder = file_folder + self.transform = transform + self.img_path_list = [] + self.label_list = [] + + if mode == "train": + self.list_file = os.path.join(self.file_folder, "train_list.txt") + else: + self.list_file = os.path.join(self.file_folder, "val_list.txt") + + with open(self.list_file, 'r') as infile: + for line in infile: + img_path = line.strip().split()[0] + img_label = int(line.strip().split()[1]) + self.img_path_list.append(os.path.join(self.file_folder, img_path)) + self.label_list.append(img_label) + print(f'----- Imagenet2012 image {mode} list len = {len(self.label_list)}') + + def __len__(self): + return len(self.label_list) + + def __getitem__(self, index): + data = image_load(self.img_path_list[index]).convert('RGB') + data = self.transform(data) + label = self.label_list[index] + + return data, label + + +def get_train_transforms(config): + """ Get training transforms + + For training, a RandomResizedCrop is applied, then normalization is applied with + [0.5, 0.5, 0.5] mean and std. The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + transforms_train = transforms.Compose([ + transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), + scale=(0.05, 1.0)), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_train + + +def get_val_transforms(config): + """ Get training transforms + + For validation, image is first Resize then CenterCrop to image_size. + Then normalization is applied with [0.5, 0.5, 0.5] mean and std. + The input pixel values must be rescaled to [0, 1.] 
+ Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) + transforms_val = transforms.Compose([ + transforms.Resize(scale_size, 'bicubic'), # single int for resize shorter side of image + transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_val + + +def get_dataset(config, mode='train'): + """ Get dataset from config and mode (train/val) + + Returns the related dataset object according to configs and mode(train/val) + + Args: + config: configs contains dataset related settings. see config.py for details + Returns: + dataset: dataset object + """ + assert mode in ['train', 'val'] + if config.DATA.DATASET == "cifar10": + if mode == 'train': + dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "cifar100": + if mode == 'train': + dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "imagenet2012": + if mode == 'train': + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_train_transforms(config)) + else: + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_val_transforms(config)) + else: + raise NotImplementedError( + "[{config.DATA.DATASET}] Only cifar10, cifar100, imagenet2012 are supported now") + return dataset + + +def get_dataloader(config, dataset, mode='train', multi_process=False): + """Get dataloader with config, dataset, mode as input, allows multiGPU settings. + + Multi-GPU loader is implements as distributedBatchSampler. + + Args: + config: see config.py for details + dataset: paddle.io.dataset object + mode: train/val + multi_process: if True, use DistributedBatchSampler to support multi-processing + Returns: + dataloader: paddle.io.DataLoader object. + """ + + if mode == 'train': + batch_size = config.DATA.BATCH_SIZE + else: + batch_size = config.DATA.BATCH_SIZE_EVAL + + if multi_process is True: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=(mode == 'train')) + dataloader = DataLoader(dataset, + batch_sampler=sampler, + num_workers=config.DATA.NUM_WORKERS) + else: + dataloader = DataLoader(dataset, + batch_size=batch_size, + num_workers=config.DATA.NUM_WORKERS, + shuffle=(mode == 'train')) + return dataloader diff --git a/image_classification/ViT/droppath.py b/image_classification/ViT/droppath.py new file mode 100644 index 00000000..25b8d5ff --- /dev/null +++ b/image_classification/ViT/droppath.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth +""" + +import paddle +import paddle.nn as nn + + +class DropPath(nn.Layer): + """DropPath class""" + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def drop_path(self, inputs): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if self.drop_prob == 0. or not self.training: + return inputs + keep_prob = 1 - self.drop_prob + keep_prob = paddle.to_tensor(keep_prob, dtype='float32') + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide(keep_prob) * random_tensor #divide is to keep same output expectation + return output + + def forward(self, inputs): + return self.drop_path(inputs) + + +#def main(): +# tmp = paddle.to_tensor(np.random.rand(8, 16, 8, 8), dtype='float32') +# dp = DropPath(0.5) +# out = dp(tmp) +# print(out) +# +#if __name__ == "__main__": +# main() diff --git a/image_classification/ViT/main_multi_gpu.py b/image_classification/ViT/main_multi_gpu.py new file mode 100644 index 00000000..496b5957 --- /dev/null +++ b/image_classification/ViT/main_multi_gpu.py @@ -0,0 +1,362 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
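A quick usage note on the `DropPath` layer added in `droppath.py` above, as a minimal sketch that assumes the module is importable from the working directory: during training the layer zeroes whole samples with probability `drop_prob` and scales the kept ones by `1/keep_prob` so the expected output matches the input; in eval mode it is a no-op.

```python
# Minimal usage sketch for the DropPath layer above (assumes droppath.py is on the import path).
import paddle
from droppath import DropPath

x = paddle.ones([8, 197, 768])        # e.g. a batch of ViT token embeddings
drop_path = DropPath(drop_prob=0.2)

drop_path.train()                     # training mode: ~20% of the samples are zeroed per call,
y_train = drop_path(x)                # the rest are scaled by 1/0.8 to preserve the expectation

drop_path.eval()                      # eval mode: the input passes through unchanged
y_eval = drop_path(x)

print(y_train.shape, y_eval.shape)    # [8, 197, 768] [8, 197, 768]
```

Scaling by `1/keep_prob` at training time means no rescaling is needed at inference, the same convention as inverted dropout.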
+ +"""ViT training/validation using multiple GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as dist +from datasets import get_dataloader, get_dataset +from transformer import build_vit as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('ViT') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +arguments = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, arguments) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + # + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + + 
train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + dist.all_reduce(loss) + dist.all_reduce(acc1) + dist.all_reduce(acc5) + loss = loss / dist.get_world_size() + acc1 = acc1 / dist.get_world_size() + acc5 = acc5 / dist.get_world_size() + + batch_size = paddle.to_tensor(image.shape[0]) + dist.all_reduce(batch_size) + + val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) + val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) + val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0]) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, "+ + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main_worker(*args): + # 0. Preparation + dist.init_parallel_env() + last_epoch = config.TRAIN.LAST_EPOCH + world_size = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + seed = config.SEED + local_rank + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # 1. Create model + model = build_model(config) + model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train, dataset_val = args[0], args[1] + dataloader_train = get_dataloader(config, dataset_train, 'train', True) + dataloader_val = get_dataloader(config, dataset_val, 'test', True) + total_batch_train = len(dataloader_train) + total_batch_val = len(dataloader_val) + logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. 
Define optimizer and lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip, + #apply_decay_param_fun=get_exclude_from_weight_decay_fn(['pos_embed', 'cls_token']), + ) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # 5. Load pretrained model / load resumt model and optimizer states + if config.MODEL.PRETRAINED: + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + + # 6. 
Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + + # 6. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER) + scheduler.step() + + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if local_rank == 0: + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +def main(): + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS + dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + + +if __name__ == "__main__": + main() diff --git a/image_classification/ViT/main_single_gpu.py b/image_classification/ViT/main_single_gpu.py new file mode 100644 index 00000000..ee7e6e1f --- /dev/null +++ b/image_classification/ViT/main_single_gpu.py @@ -0,0 +1,333 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
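The `validate()` function in `main_multi_gpu.py` above repeats the same reduce-then-divide pattern for loss, acc@1 and acc@5. Below is a hedged sketch of that pattern as a helper (the name `all_reduce_mean` is hypothetical, not part of the repository), assuming it runs inside a process group started by `dist.init_parallel_env()`.

```python
# Hypothetical helper distilled from validate() above; not part of the repository.
import paddle.distributed as dist

def all_reduce_mean(metric):
    """Sum a scalar tensor across all ranks in place, then divide by the world size."""
    dist.all_reduce(metric)                  # every rank ends up with the global sum
    return metric / dist.get_world_size()    # turn the sum into a cross-GPU average

# inside the validation loop, each rank would then do e.g.:
#   loss = all_reduce_mean(loss)
#   acc1 = all_reduce_mean(acc1)
#   acc5 = all_reduce_mean(acc5)
```

The script additionally all-reduces the per-rank batch size, so the meters weight each update by the global number of samples seen in that step.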
+ +"""ViT training/validation using single GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from datasets import get_dataloader +from datasets import get_dataset +from transformer import build_vit as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('ViT') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +args = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, args) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + + train_time = 
time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + batch_size = image.shape[0] + val_loss_meter.update(loss.numpy()[0], batch_size) + val_acc1_meter.update(acc1.numpy()[0], batch_size) + val_acc5_meter.update(acc5.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main(): + # 0. Preparation + last_epoch = config.TRAIN.LAST_EPOCH + seed = config.SEED + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + #paddle.set_device('gpu:0') + # 1. Create model + model = build_model(config) + #model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + dataloader_train = get_dataloader(config, dataset_train, 'train', False) + dataloader_val = get_dataloader(config, dataset_val, 'val', False) + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. Define lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + # 5. 
Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + # 6. Load pretrained model or load resume model and optimizer states + if config.MODEL.PRETRAINED: + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") + # 7. Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + # 8. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. 
LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=len(dataloader_train), + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + ) + scheduler.step() + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +if __name__ == "__main__": + main() diff --git a/image_classification/ViT/port_weights/load_pytorch_weights.py b/image_classification/ViT/port_weights/load_pytorch_weights.py new file mode 100644 index 00000000..ffe1902c --- /dev/null +++ b/image_classification/ViT/port_weights/load_pytorch_weights.py @@ -0,0 +1,165 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
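Both training scripts pass `WarmupCosineScheduler` to the optimizer as the learning rate. The exact implementation lives in `utils.py` (not shown here); the sketch below only illustrates the overall shape that the unit tests in `tests/test_utils.py` pin down, namely a linear warmup from `warmup_start_lr` to `base_lr` followed by a cosine decay to `end_lr`.

```python
# Shape sketch of the warmup + cosine schedule; the utils.py implementation may differ in details.
import math

def warmup_cosine_lr(epoch, base_lr=0.1, warmup_start_lr=1e-5, end_lr=0.0,
                     warmup_epochs=10, total_epochs=100):
    if epoch < warmup_epochs:
        # linear warmup from warmup_start_lr up to base_lr
        return warmup_start_lr + (base_lr - warmup_start_lr) * epoch / warmup_epochs
    # cosine decay from base_lr down to end_lr over the remaining epochs
    progress = (epoch - warmup_epochs) / (total_epochs - warmup_epochs)
    return end_lr + 0.5 * (base_lr - end_lr) * (1.0 + math.cos(math.pi * progress))

print(warmup_cosine_lr(0), warmup_cosine_lr(10), warmup_cosine_lr(100))  # 1e-05 0.1 0.0
```

In the training loops above, `scheduler.step()` is called once per epoch, so the schedule is indexed by epoch rather than by iteration.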
+ +import argparse +import numpy as np +import paddle +import torch +import timm +from transformer import build_vit +from config import * + + +config = get_config('./configs/vit_base_patch16_224.yaml') +print(config) + + +def print_model_named_params(model): + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + print('----------------------------------') + + +def print_model_named_buffers(model): + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + print('----------------------------------') + + +def torch_to_paddle_mapping(): + prefix = 'patch_embedding' + mapping = [ + ('cls_token', f'{prefix}.cls_token'), + ('pos_embed', f'{prefix}.position_embeddings'), + ('patch_embed.proj', f'{prefix}.patch_embedding'), + ] + + num_layers = 12 + for idx in range(num_layers): + pp_prefix = f'encoder.layers.{idx}' + th_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.attn_norm'), + (f'{th_prefix}.norm2', f'{pp_prefix}.mlp_norm'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.out'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'encoder.encoder_norm'), + ('head', 'classifier') + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'**SET** {th_name} {th_shape} **TO** {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + + if len(value.shape) == 2 and transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_vit(config) + paddle_model.eval() + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('+++++++++++++++++++++++++++++++++++') + device = torch.device('cpu') + torch_model = timm.create_model('vit_base_patch16_224', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print('========================================================') + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./vit_base_patch16_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + print('all done') + + +if __name__ == "__main__": + main() diff --git a/image_classification/ViT/port_weights/load_pytorch_weights_384.py b/image_classification/ViT/port_weights/load_pytorch_weights_384.py new file mode 100644 index 00000000..3f06c643 --- /dev/null +++ b/image_classification/ViT/port_weights/load_pytorch_weights_384.py @@ -0,0 +1,165 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
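The `_set_value()` helper in the port script above transposes every 2-D weight. That is because the two frameworks store linear weights with opposite layouts, while convolution kernels already agree; the small check below (an illustration that requires both torch and paddle installed, run on CPU) shows the mismatch.

```python
# Why 2-D weights are transposed during porting: torch.nn.Linear keeps its weight as
# [out_features, in_features], paddle.nn.Linear as [in_features, out_features].
# Conv2D kernels are [out_channels, in_channels, kH, kW] in both, so they are copied as-is.
import torch
import paddle

torch_fc = torch.nn.Linear(768, 3072)
paddle_fc = paddle.nn.Linear(768, 3072)

print(tuple(torch_fc.weight.shape))    # (3072, 768)
print(tuple(paddle_fc.weight.shape))   # (768, 3072)

# copying therefore needs a transpose, exactly as _set_value() does for 2-D tensors:
paddle_fc.weight.set_value(torch_fc.weight.detach().numpy().transpose((1, 0)))
```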
+ +import argparse +import numpy as np +import paddle +import torch +import timm +from transformer import build_vit +from config import * + + +config = get_config('./configs/vit_base_patch16_384.yaml') +print(config) + + +def print_model_named_params(model): + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + print('----------------------------------') + + +def print_model_named_buffers(model): + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + print('----------------------------------') + + +def torch_to_paddle_mapping(): + prefix = 'patch_embedding' + mapping = [ + ('cls_token', f'{prefix}.cls_token'), + ('pos_embed', f'{prefix}.position_embeddings'), + ('patch_embed.proj', f'{prefix}.patch_embedding'), + ] + + num_layers = config.MODEL.TRANS.DEPTH + for idx in range(num_layers): + pp_prefix = f'encoder.layers.{idx}' + th_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.attn_norm'), + (f'{th_prefix}.norm2', f'{pp_prefix}.mlp_norm'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.out'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'encoder.encoder_norm'), + ('head', 'classifier') + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'**SET** {th_name} {th_shape} **TO** {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + + if len(value.shape) == 2 and transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_vit(config) + paddle_model.eval() + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('+++++++++++++++++++++++++++++++++++') + device = torch.device('cpu') + torch_model = timm.create_model('vit_base_patch16_384', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 384, 384).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print('========================================================') + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./vit_base_patch16_384.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + print('all done') + + +if __name__ == "__main__": + main() diff --git a/image_classification/ViT/port_weights/load_pytorch_weights_large.py b/image_classification/ViT/port_weights/load_pytorch_weights_large.py new file mode 100644 index 00000000..ddbd6d81 --- /dev/null +++ b/image_classification/ViT/port_weights/load_pytorch_weights_large.py @@ -0,0 +1,165 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
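The 224 and 384 ports above cannot share a checkpoint because the length of the position-embedding table depends on the input resolution; a quick way to see the difference with the patch size of 16 used by both configs:

```python
# Token count for a ViT patch grid plus one cls token; this is why the pos_embed shapes of the
# 224 and 384 models differ and each resolution needs its own ported checkpoint.
def num_tokens(image_size, patch_size=16):
    return (image_size // patch_size) ** 2 + 1  # patches + cls token

print(num_tokens(224))  # 197
print(num_tokens(384))  # 577
```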
+ +import argparse +import numpy as np +import paddle +import torch +import timm +from transformer import build_vit +from config import * + + +config = get_config('./configs/vit_large_patch16_224.yaml') +print(config) + + +def print_model_named_params(model): + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + print('----------------------------------') + + +def print_model_named_buffers(model): + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + print('----------------------------------') + + +def torch_to_paddle_mapping(): + prefix = 'patch_embedding' + mapping = [ + ('cls_token', f'{prefix}.cls_token'), + ('pos_embed', f'{prefix}.position_embeddings'), + ('patch_embed.proj', f'{prefix}.patch_embedding'), + ] + + num_layers = config.MODEL.TRANS.DEPTH + for idx in range(num_layers): + pp_prefix = f'encoder.layers.{idx}' + th_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.attn_norm'), + (f'{th_prefix}.norm2', f'{pp_prefix}.mlp_norm'), + (f'{th_prefix}.mlp.fc1', f'{pp_prefix}.mlp.fc1'), + (f'{th_prefix}.mlp.fc2', f'{pp_prefix}.mlp.fc2'), + (f'{th_prefix}.attn.qkv', f'{pp_prefix}.attn.qkv'), + (f'{th_prefix}.attn.proj', f'{pp_prefix}.attn.out'), + ] + mapping.extend(layer_mapping) + + head_mapping = [ + ('norm', 'encoder.encoder_norm'), + ('head', 'classifier') + ] + mapping.extend(head_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'**SET** {th_name} {th_shape} **TO** {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + + if len(value.shape) == 2 and transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_vit(config) + paddle_model.eval() + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('+++++++++++++++++++++++++++++++++++') + device = torch.device('cpu') + torch_model = timm.create_model('vit_large_patch16_224', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0, 0:100]) + print('========================================================') + print(out_paddle[0, 0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-5) + + # save weights for paddle model + model_path = os.path.join('./vit_large_patch16_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + print('all done') + + +if __name__ == "__main__": + main() diff --git a/image_classification/ViT/run_eval.sh b/image_classification/ViT/run_eval.sh new file mode 100644 index 00000000..51041d55 --- /dev/null +++ b/image_classification/ViT/run_eval.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ +-cfg='./configs/vit_base_patch16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=128 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./vit_base_patch16_224' diff --git a/image_classification/ViT/run_eval_base_224.sh b/image_classification/ViT/run_eval_base_224.sh new file mode 100644 index 00000000..51041d55 --- /dev/null +++ b/image_classification/ViT/run_eval_base_224.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ +-cfg='./configs/vit_base_patch16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=128 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./vit_base_patch16_224' diff --git a/image_classification/ViT/run_eval_multi.sh b/image_classification/ViT/run_eval_multi.sh new file mode 100644 index 00000000..efd9c34e --- /dev/null +++ b/image_classification/ViT/run_eval_multi.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/vit_base_patch16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./vit_base_patch16_224' \ diff --git a/image_classification/ViT/run_eval_multi_384.sh b/image_classification/ViT/run_eval_multi_384.sh new file mode 100644 index 00000000..0a771aa8 --- /dev/null +++ b/image_classification/ViT/run_eval_multi_384.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/vit_base_patch16_384.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ 
+-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./vit_base_patch16_384' diff --git a/image_classification/ViT/run_eval_multi_large.sh b/image_classification/ViT/run_eval_multi_large.sh new file mode 100644 index 00000000..eb839a13 --- /dev/null +++ b/image_classification/ViT/run_eval_multi_large.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/vit_large_patch16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=16 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./vit_large_patch16_224' diff --git a/image_classification/ViT/run_train.sh b/image_classification/ViT/run_train.sh new file mode 100644 index 00000000..cfb5f0b1 --- /dev/null +++ b/image_classification/ViT/run_train.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ +-cfg='./configs/vit_base_patch16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=8 \ +-data_path='/dataset/imagenet' \ diff --git a/image_classification/ViT/run_train_multi.sh b/image_classification/ViT/run_train_multi.sh new file mode 100644 index 00000000..93488e42 --- /dev/null +++ b/image_classification/ViT/run_train_multi.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/vit_base_patch16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=8 \ +-data_path='/dataset/imagenet' \ diff --git a/image_classification/ViT/tests/__init__.py b/image_classification/ViT/tests/__init__.py new file mode 100644 index 00000000..84952a81 --- /dev/null +++ b/image_classification/ViT/tests/__init__.py @@ -0,0 +1 @@ +# init \ No newline at end of file diff --git a/image_classification/ViT/tests/test_config.py b/image_classification/ViT/tests/test_config.py new file mode 100644 index 00000000..6806e8a1 --- /dev/null +++ b/image_classification/ViT/tests/test_config.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
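The `tests/` package that starts here uses the standard `unittest` framework. A hedged runner sketch, assuming it is executed from `image_classification/ViT/` so that `config.py`, `datasets.py` and `transformer.py` resolve on the import path (several tests are skipped or expect the ImageNet path to exist):

```python
# Hedged sketch for running the ViT unit tests from image_classification/ViT/.
import unittest

suite = unittest.defaultTestLoader.discover('tests', pattern='test_*.py')
unittest.TextTestRunner(verbosity=2).run(suite)
```

Tests decorated with `@unittest.skip('skip for debug')` are reported as skipped rather than executed.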
+ +import unittest +import argparse +from config import update_config, get_config + +class ConfigTest(unittest.TestCase): + def setUp(self): + parser = argparse.ArgumentParser('') + parser.add_argument('-cfg', type=str, default=None) + parser.add_argument('-dataset', type=str, default="cifar10") + parser.add_argument('-batch_size', type=int, default=128) + parser.add_argument('-image_size', type=int, default=256) + parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-data_path', type=str, default='/cifar10/') + parser.add_argument('-eval', action='store_false') # enable eval + parser.add_argument('-pretrained', type=str, default='pretrained') + parser.add_argument('-resume', type=str, default=None) + parser.add_argument('-last_epoch', type=int, default=None) + self.args = parser.parse_args() + + def tearDown(self): + pass + + def test_update_config(self): + config = get_config() + config = update_config(config, self.args) + + self.assertEqual(config.DATA.DATASET, 'cifar10') + self.assertEqual(config.DATA.BATCH_SIZE, 128) + self.assertEqual(config.DATA.IMAGE_SIZE, 256) + self.assertEqual(config.DATA.DATA_PATH, '/cifar10/') + self.assertEqual(config.EVAL, True) + self.assertEqual(config.DATA.BATCH_SIZE_EVAL, 128) + self.assertEqual(config.MODEL.PRETRAINED, 'pretrained') + + def test_update_config_from_file(self): + config = get_config() + self.args.cfg = './tests/test_config.yaml' + self.args.image_size = None + self.args.ngpus = None + config = update_config(config, self.args) + + self.assertEqual(config.DATA.IMAGE_SIZE, 384) + self.assertEqual(config.DATA.CROP_PCT, 1.0) + + self.assertEqual(config.MODEL.TRANS.PATCH_SIZE, 16) + self.assertEqual(config.MODEL.TRANS.EMBED_DIM, 768) + self.assertEqual(config.MODEL.TRANS.MLP_RATIO, 4.0) + self.assertEqual(config.MODEL.TRANS.DEPTH, 12) + self.assertEqual(config.MODEL.TRANS.NUM_HEADS, 12) + self.assertEqual(config.MODEL.TRANS.QKV_BIAS, True) + + self.assertEqual(config.MODEL.NAME, 'vit_base_patch16_224') + self.assertEqual(config.MODEL.TYPE, 'ViT') + + def test_get_config(self): + config1 = get_config() + config2 = get_config() + self.assertEqual(config1, config2) diff --git a/image_classification/ViT/tests/test_config.yaml b/image_classification/ViT/tests/test_config.yaml new file mode 100644 index 00000000..19709906 --- /dev/null +++ b/image_classification/ViT/tests/test_config.yaml @@ -0,0 +1,14 @@ +DATA: + IMAGE_SIZE: 384 + CROP_PCT: 1.0 +MODEL: + TYPE: ViT + NAME: vit_base_patch16_224 + TRANS: + PATCH_SIZE: 16 + EMBED_DIM: 768 + MLP_RATIO: 4.0 + DEPTH: 12 + NUM_HEADS: 12 + QKV_BIAS: true + diff --git a/image_classification/ViT/tests/test_datasets.py b/image_classification/ViT/tests/test_datasets.py new file mode 100644 index 00000000..79952137 --- /dev/null +++ b/image_classification/ViT/tests/test_datasets.py @@ -0,0 +1,147 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
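One detail in `ConfigTest.setUp()` above that is easy to miss: `-eval` is declared with `action='store_false'`, so the flag defaults to `True` when it is not passed, which is what the `# enable eval` comment and the `config.EVAL == True` assertion rely on. A quick stdlib illustration:

```python
# With action='store_false' the destination defaults to True; passing the flag flips it to False.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-eval', action='store_false')

print(parser.parse_args([]).eval)         # True  -> config.EVAL stays enabled in the test
print(parser.parse_args(['-eval']).eval)  # False
```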
+ +import unittest +import argparse +from config import * +from datasets import * +from paddle.io import DataLoader +#from multiprocessing import SimpleQueue + +#paddle.set_device('cpu') + +class DatasetTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + parser = argparse.ArgumentParser('') + parser.add_argument('-cfg', type=str, default=None) + parser.add_argument('-dataset', type=str, default='imagenet2012') + parser.add_argument('-batch_size', type=int, default=4) + parser.add_argument('-image_size', type=int, default=224) + parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-data_path', type=str, default='/dataset/imagenet') + parser.add_argument('-eval', action='store_true') + parser.add_argument('-pretrained', type=str, default=None) + parser.add_argument('-resume', type=str, default=None) + parser.add_argument('-last_epoch', type=int, default=None) + cls.args = parser.parse_args() + cls.config = get_config() + cls.config = update_config(cls.config, cls.args) + + cls.dataset_train = get_dataset(DatasetTest.config, mode='train') + cls.dataset_test = get_dataset(DatasetTest.config, mode='val') + + @classmethod + def tearDown(cls): + pass + + @unittest.skip('skip for debug') + def test_shape(self): + sample = next(iter(DatasetTest.dataset_train)) + self.assertEqual([3, 224, 224], sample[0].shape) + + sample = next(iter(DatasetTest.dataset_test)) + self.assertEqual([3, 224, 224], sample[0].shape) + + @unittest.skip('skip for debug') + def test_scaling(self): + sample = next(iter(DatasetTest.dataset_train))[0] + self.assertTrue(paddle.any(sample < 0)) + self.assertTrue(paddle.any(sample > 0)) + self.assertGreaterEqual(1, sample.max().cpu().numpy()) + self.assertLessEqual(-1, sample.min().cpu().numpy()) + + sample = next(iter(DatasetTest.dataset_test))[0] + self.assertGreaterEqual(1, sample.max().cpu().numpy()) + self.assertLessEqual(-1, sample.min().cpu().numpy()) + self.assertTrue(paddle.any(sample < 0)) + self.assertTrue(paddle.any(sample > 0)) + + @unittest.skip('skip for debug') + def test_single_process_dataloader(self): + self._test_loader(DatasetTest.dataset_train, 'train', False) + self._test_loader(DatasetTest.dataset_test, 'test', False) + + def _test_loader(self, dataset, mode, multi_process): + dataloader = get_dataloader(DatasetTest.config, + dataset, + mode=mode, + multi_process=multi_process) + for idx, _ in enumerate(dataloader): + if idx > 0 and idx % 1 == 0: + print(f'----- test single process dataloader: {idx}/{len(dataloader)}') + if idx == 10: + return + + @unittest.skip('skip for debug') + def test_multi_process_dataloader(self): + tester = Tester() + tester.run() + self.assertEqual(tester.n_samples, 50000) + + + + +class Tester: + def __init__(self): + parser = argparse.ArgumentParser('') + parser.add_argument('-cfg', type=str, default=None) + parser.add_argument('-dataset', type=str, default='imagenet2012') + parser.add_argument('-batch_size', type=int, default=256) + parser.add_argument('-image_size', type=int, default=224) + parser.add_argument('-data_path', type=str, default='/dataset/imagenet/') + parser.add_argument('-eval', action='store_false') # set test batch size + parser.add_argument('-pretrained', type=str, default=None) + args = parser.parse_args() + self.config = get_config() + self.config = update_config(self.config, args) + self.dataset_train = get_dataset(self.config, mode='train') + self.dataset_test = get_dataset(self.config, mode='val') + self.n_samples = 0 + + def run(self, mode='test'): + # 
https://github.com/PaddlePaddle/Paddle/blob/5d8e4395b61929627151f6fd4a607589288a78bf/python/paddle/distributed/spawn.py#L272 + context = dist.spawn(self.main_worker, args=(mode,)) + self.n_samples = context.return_queues[0].get() + print(f'----- total samples: {self.n_samples}') + + def main_worker(self, *args): + mode = args[0] + dist.init_parallel_env() + local_rank = dist.get_rank() + if mode == 'train': + n_samples = self._test_loader(self.config, self.dataset_train, 'train', True) + else: + n_samples = self._test_loader(self.config, self.dataset_test, 'test', True) + + n_samples = paddle.to_tensor(np.array([n_samples])) + dist.reduce(n_samples, 0) + if local_rank == 0: + return n_samples.cpu().numpy() + + + def _test_loader(self, config, dataset, mode, multi_process): + n_samples = 0 + dataloader = get_dataloader(config, + dataset, + mode=mode, + multi_process=multi_process) + local_rank = dist.get_rank() + for idx, data in enumerate(dataloader): + if idx > 0 and idx % 1 == 0: + print(f'----- test single process({local_rank}) dataloader: {idx}/{len(dataloader)}') + #print(local_rank, data[1]) + n_samples += data[0].shape[0] + + return n_samples diff --git a/image_classification/ViT/tests/test_transformer.py b/image_classification/ViT/tests/test_transformer.py new file mode 100644 index 00000000..32d82d98 --- /dev/null +++ b/image_classification/ViT/tests/test_transformer.py @@ -0,0 +1,105 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
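For context on the numbers behind `test_multi_process_dataloader` above: with the ImageNet-2012 validation split (50,000 images), the batch size of 256 from `Tester.__init__` and, say, 4 GPUs (an assumption here, matching the multi-GPU run scripts), each rank iterates roughly 49 batches and `dist.reduce` sums the per-rank counts back to exactly 50,000 on rank 0.

```python
# Back-of-the-envelope check for the 50,000-sample assertion (4 GPUs is an assumption).
world_size = 4
batch_size = 256
total_val_images = 50_000

per_rank = total_val_images // world_size                     # 12,500 samples on each rank
batches_per_rank = (per_rank + batch_size - 1) // batch_size  # 49 batches on each rank
print(per_rank, batches_per_rank, per_rank * world_size)      # 12500 49 50000
```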
+ +import unittest +import numpy as np +import paddle +import paddle.nn as nn +from config import * +from transformer import build_vit +from transformer import PatchEmbedding +from transformer import Attention +from transformer import Mlp +from transformer import Encoder + + +class TransformerTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + paddle.set_device('cpu') + cls.config = get_config() + cls.dummy_img = np.random.randn(4, 3, 224, 224).astype('float32') + cls.dummy_tensor = paddle.to_tensor(cls.dummy_img) + cls.vit = build_vit(cls.config) + + @classmethod + def tearDown(cls): + pass + + + @unittest.skip('skip for debug') + def test_out_shape(self): + logits, _ = TransformerTest.vit(TransformerTest.dummy_tensor) + self.assertEqual(logits.shape, [4, 1000]) + + @unittest.skip('skip for debug') + def test_all_parameters_updated(self): + optim = paddle.optimizer.SGD(parameters=TransformerTest.vit.parameters(), learning_rate=0.1) + out, _ = TransformerTest.vit(TransformerTest.dummy_tensor) + loss = out.mean() + loss.backward() + optim.step() + + for name, param in TransformerTest.vit.named_parameters(): + if not param.stop_gradient: + self.assertIsNotNone(param.gradient()) + self.assertNotEqual(0, np.sum(param.gradient()**2)) + + @unittest.skip('skip for debug') + def test_embeddings(self): + embed = PatchEmbedding() + dummy_img = np.random.randn(4, 3, 224, 224).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + + patch_out = embed.patch_embeddings(dummy_tensor) + embed_out = embed(dummy_tensor) + self.assertEqual(patch_out.shape, [4, 768, 7, 7]) + self.assertEqual(embed.cls_token.shape, [1, 1, 768]) + self.assertEqual(embed_out.shape, [4, 50, 768]) + + @unittest.skip('skip for debug') + def test_attention(self): + attn_op = Attention( + TransformerTest.config.MODEL.TRANS.EMBED_DIM, + TransformerTest.config.MODEL.TRANS.NUM_HEADS, + TransformerTest.config.MODEL.TRANS.QKV_BIAS) + dummy_img = np.random.randn(4, 50, 768).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + + out, attn = attn_op(dummy_tensor) + self.assertEqual(attn.shape, [4, 12, 50, 50]) + self.assertEqual(out.shape, [4, 50, 768]) + + def test_mlp(self): + mlp_op = Mlp( + TransformerTest.config.MODEL.TRANS.EMBED_DIM, + TransformerTest.config.MODEL.TRANS.MLP_RATIO) + dummy_img = np.random.randn(4, 50, 768).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + + out = mlp_op(dummy_tensor) + self.assertEqual(out.shape, [4, 50, 768]) + + def test_encoder(self): + encoder_op = Encoder( + TransformerTest.config.MODEL.TRANS.EMBED_DIM, + TransformerTest.config.MODEL.TRANS.NUM_HEADS, + TransformerTest.config.MODEL.TRANS.DEPTH, + ) + dummy_img = np.random.randn(4, 50, 768).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + + out, _ = encoder_op(dummy_tensor) + self.assertEqual(out.shape, [4, 50, 768]) + diff --git a/image_classification/ViT/tests/test_utils.py b/image_classification/ViT/tests/test_utils.py new file mode 100644 index 00000000..18c579f3 --- /dev/null +++ b/image_classification/ViT/tests/test_utils.py @@ -0,0 +1,94 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import paddle.nn as nn +from utils import AverageMeter +from utils import WarmupCosineScheduler +from utils import get_exclude_from_weight_decay_fn + + +class UtilTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + pass + + @classmethod + def tearDown(cls): + pass + + def test_average_meter(self): + meter = AverageMeter() + for i in range(1, 101): + meter.update(i, 1) + self.assertEqual(meter.avg, 50.5) + + def test_warmup_cosine_scheduler(self): + sch = WarmupCosineScheduler(learning_rate=0.1, + warmup_start_lr=1e-5, + start_lr=0.1, + end_lr=0.0, + warmup_epochs=10, + total_epochs=100, + last_epoch=-1) + lrs = [] + for epoch in range(100): + lr = sch.get_lr() + lrs.append(lr) + sch.step() + lrs.append(sch.get_lr()) + + self.assertEqual(lrs[0], 1e-5) + self.assertEqual(lrs[10], 0.1) + self.assertEqual(lrs[-1], 0.0) + self.assertGreaterEqual(min(lrs[0:10]), 1e-5) + self.assertLessEqual(max(lrs[0:10]), 0.1) + self.assertGreaterEqual(min(lrs[10::]), 0.0) + self.assertLessEqual(max(lrs[10::]), 0.1) + + def test_warmup_cosine_scheduler_last_epoch(self): + sch = WarmupCosineScheduler(learning_rate=0.1, + warmup_start_lr=1e-5, + start_lr=0.1, + end_lr=0.0, + warmup_epochs=10, + total_epochs=100, + last_epoch=9) + lrs = [] + for epoch in range(10, 100): + lr = sch.get_lr() + lrs.append(lr) + sch.step() + lrs.append(sch.get_lr()) + + self.assertEqual(lrs[0], 0.1) + self.assertEqual(lrs[-1], 0.0) + self.assertGreaterEqual(min(lrs[::]), 0.0) + self.assertLessEqual(max(lrs[::]), 0.1) + + def test_get_exclude_from_weight_decay_fn(self): + model = nn.Linear(10, 100, bias_attr=True) + exclude_list = ['bias'] + fn = get_exclude_from_weight_decay_fn(exclude_list) + # should return false if name in exclude_list + for name, param in model.named_parameters(): + if name.endswith('weight'): + self.assertTrue(fn(name)) + elif name.endswith('bias'): + self.assertFalse(fn(name)) + + + + diff --git a/image_classification/ViT/transformer.py b/image_classification/ViT/transformer.py new file mode 100644 index 00000000..24135988 --- /dev/null +++ b/image_classification/ViT/transformer.py @@ -0,0 +1,435 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Implement Transformer Class for ViT +""" + +import copy +import paddle +import paddle.nn as nn +from droppath import DropPath +from config import get_config + + +class Identity(nn.Layer): + """ Identity layer + The output of this layer is the input without any change. 
+ Use this layer to avoid using 'if' condition in forward methods + """ + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +class PatchEmbedding(nn.Layer): + """Patch Embedding and Position Embedding + + Apply patch embedding and position embedding on input images. + + Attributes: + patch_embddings: impl using a patch_size x patch_size Conv2D operation + position_embddings: a parameter with len = num_patch + 1(for cls_token) + cls_token: token insert to the patch feature for classification + dropout: dropout for embeddings + """ + + def __init__(self, + image_size=224, + patch_size=16, + in_channels=3, + embed_dim=768, + dropout=0.): + super().__init__() + n_patches = (image_size // patch_size) * (image_size // patch_size) + + self.patch_embedding = nn.Conv2D(in_channels=in_channels, + out_channels=embed_dim, + kernel_size=patch_size, + stride=patch_size) + + self.position_embeddings = paddle.create_parameter( + shape=[1, n_patches+1, embed_dim], + dtype='float32', + default_initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + + self.cls_token = paddle.create_parameter( + shape=[1, 1, embed_dim], + dtype='float32', + default_initializer=paddle.nn.initializer.Constant(0)) + + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + cls_tokens = self.cls_token.expand((x.shape[0], -1, -1)) + x = self.patch_embedding(x) + x = x.flatten(2) + x = x.transpose([0, 2, 1]) + x = paddle.concat((cls_tokens, x), axis=1) + + embeddings = x + self.position_embeddings # tensor broadcast + embeddings = self.dropout(embeddings) + return embeddings + + +class Attention(nn.Layer): + """ Attention module + + Attention module for ViT, here q, k, v are assumed the same. + The qkv mappings are stored as one single param. 
+ + Attributes: + num_heads: number of heads + attn_head_size: feature dim of single head + all_head_size: feature dim of all heads + qkv: a nn.Linear for q, k, v mapping + scales: 1 / sqrt(single_head_feature_dim) + out: projection of multi-head attention + attn_dropout: dropout for attention + proj_dropout: final dropout before output + softmax: softmax op for attention + """ + def __init__(self, + embed_dim, + num_heads, + qkv_bias=True, + dropout=0., + attention_dropout=0.): + super().__init__() + self.num_heads = num_heads + self.attn_head_size = int(embed_dim / self.num_heads) + self.all_head_size = self.attn_head_size * self.num_heads + + w_attr_1, b_attr_1 = self._init_weights() + self.qkv = nn.Linear(embed_dim, + self.all_head_size*3, #weights for q, k, and v + weight_attr=w_attr_1, + bias_attr=b_attr_1 if qkv_bias else False) + + self.scales = self.attn_head_size ** -0.5 + + w_attr_2, b_attr_2 = self._init_weights() + self.out = nn.Linear(embed_dim, + embed_dim, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + + self.attn_dropout = nn.Dropout(attention_dropout) + self.proj_dropout = nn.Dropout(dropout) + self.softmax = nn.Softmax(axis=-1) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + return weight_attr, bias_attr + + def transpose_multihead(self, x): + new_shape = x.shape[:-1] + [self.num_heads, self.attn_head_size] + x = x.reshape(new_shape) + x = x.transpose([0, 2, 1, 3]) + return x + + def forward(self, x): + qkv = self.qkv(x).chunk(3, axis=-1) + q, k, v = map(self.transpose_multihead, qkv) + + attn = paddle.matmul(q, k, transpose_y=True) + attn = attn * self.scales + attn = self.softmax(attn) + attn_weights = attn + attn = self.attn_dropout(attn) + + z = paddle.matmul(attn, v) + z = z.transpose([0, 2, 1, 3]) + new_shape = z.shape[:-2] + [self.all_head_size] + z = z.reshape(new_shape) + # reshape + z = self.out(z) + z = self.proj_dropout(z) + return z, attn_weights + + +class Mlp(nn.Layer): + """ MLP module + + Impl using nn.Linear and activation is GELU, dropout is applied. 
+ Ops: fc -> act -> dropout -> fc -> dropout + + Attributes: + fc1: nn.Linear + fc2: nn.Linear + act: GELU + dropout1: dropout after fc1 + dropout2: dropout after fc2 + """ + def __init__(self, + embed_dim, + mlp_ratio, + dropout=0.): + super().__init__() + w_attr_1, b_attr_1 = self._init_weights() + self.fc1 = nn.Linear(embed_dim, + int(embed_dim * mlp_ratio), + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + w_attr_2, b_attr_2 = self._init_weights() + self.fc2 = nn.Linear(int(embed_dim * mlp_ratio), + embed_dim, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + self.act = nn.GELU() + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + def _init_weights(self): + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.XavierUniform()) #default in pp: xavier + bias_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Normal(std=1e-6)) #default in pp: zero + return weight_attr, bias_attr + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.dropout1(x) + x = self.fc2(x) + x = self.dropout2(x) + return x + + +class EncoderLayer(nn.Layer): + """Encoder Layer + + Encoder layer contains attention, norm, mlp and residual + + Attributes: + hidden_size: transformer feature dim + attn_norm: nn.LayerNorm before attention + mlp_norm: nn.LayerNorm before mlp + mlp: mlp modual + attn: attention modual + """ + def __init__(self, + embed_dim, + num_heads, + qkv_bias=True, + mlp_ratio=4., + dropout=0., + attention_dropout=0., + droppath=0.): + super().__init__() + w_attr_1, b_attr_1 = self._init_weights() + self.attn_norm = nn.LayerNorm(embed_dim, + weight_attr=w_attr_1, + bias_attr=b_attr_1, + epsilon=1e-6) + + self.attn = Attention(embed_dim, + num_heads, + qkv_bias, + dropout, + attention_dropout) + self.drop_path = DropPath(droppath) if droppath > 0. else Identity() + + w_attr_2, b_attr_2 = self._init_weights() + self.mlp_norm = nn.LayerNorm(embed_dim, + weight_attr=w_attr_2, + bias_attr=b_attr_2, + epsilon=1e-6) + + self.mlp = Mlp(embed_dim, mlp_ratio, dropout) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0.0)) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(1.0)) + return weight_attr, bias_attr + + def forward(self, x): + h = x + x = self.attn_norm(x) + x, attn = self.attn(x) + x = self.drop_path(x) + x = x + h + + h = x + x = self.mlp_norm(x) + x = self.mlp(x) + x = self.drop_path(x) + x = x + h + + return x, attn + + +class Encoder(nn.Layer): + """Transformer encoder + + Encoder encoder contains a list of EncoderLayer, and a LayerNorm. 
+ + Attributes: + layers: nn.LayerList contains multiple EncoderLayers + encoder_norm: nn.LayerNorm which is applied after last encoder layer + """ + def __init__(self, + embed_dim, + num_heads, + depth, + qkv_bias=True, + mlp_ratio=4.0, + dropout=0., + attention_dropout=0., + droppath=0.): + super(Encoder, self).__init__() + # stochatic depth decay + depth_decay = [x.item() for x in paddle.linspace(0, droppath, depth)] + layer_list = [] + for i in range(depth): + encoder_layer = EncoderLayer(embed_dim, + num_heads, + qkv_bias=True, + mlp_ratio=4., + dropout=0., + attention_dropout=0., + droppath=depth_decay[i]) + layer_list.append(copy.deepcopy(encoder_layer)) + self.layers = nn.LayerList(layer_list) + + w_attr_1, b_attr_1 = self._init_weights() + self.encoder_norm = nn.LayerNorm(embed_dim, + weight_attr=w_attr_1, + bias_attr=b_attr_1, + epsilon=1e-6) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0.0)) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(1.0)) + return weight_attr, bias_attr + + def forward(self, x): + self_attn = [] + for layer in self.layers: + x, attn = layer(x) + self_attn.append(attn) + out = self.encoder_norm(x) + return out, self_attn + + +class VisualTransformer(nn.Layer): + """ViT transformer + + ViT Transformer, classifier is a single Linear layer for finetune, + For training from scratch, two layer mlp should be used. + Classification is done using cls_token. + + Args: + image_size: int, input image size, default: 224 + patch_size: int, patch size, default: 16 + in_channels: int, input image channels, default: 3 + num_classes: int, number of classes for classification, default: 1000 + embed_dim: int, embedding dimension (patch embed out dim), default: 768 + depth: int, number ot transformer blocks, default: 12 + num_heads: int, number of attention heads, default: 12 + mlp_ratio: float, ratio of mlp hidden dim to embed dim(mlp in dim), default: 4.0 + qkv_bias: bool, If True, enable qkv(nn.Linear) layer with bias, default: True + dropout: float, dropout rate for linear layers, default: 0. + attention_dropout: float, dropout rate for attention layers default: 0. + droppath: float, droppath rate for droppath layers, default: 0. 
+ """ + def __init__(self, + image_size=224, + patch_size=16, + in_channels=3, + num_classes=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + dropout=0., + attention_dropout=0., + droppath=0., + train_from_scratch=False): + super(VisualTransformer, self).__init__() + # create patch embedding with positional embedding + self.patch_embedding = PatchEmbedding(image_size, + patch_size, + in_channels, + embed_dim, + dropout) + # create multi head self-attention layers + self.encoder = Encoder(embed_dim, + num_heads, + depth, + qkv_bias, + mlp_ratio, + dropout, + attention_dropout, + droppath) + + # classifier head (for training from scracth) + if train_from_scratch: + w_attr_1, b_attr_1 = self._init_weights() + w_attr_2, b_attr_2 = self._init_weights() + self.classifier = nn.Sequential( + nn.Linear(config.MODEL.TRANS.HIDDEN_SIZE, + config.MODEL.TRANS.HIDDEN_SIZE, + weight_attr=w_attr_1, + bias_attr=b_attr_1), + nn.ReLU(), + nn.Dropout(config.MODEL.DROPOUT), + nn.Linear(config.MODEL.TRANS.HIDDEN_SIZE, + config.MODEL.NUM_CLASSES, + weight_attr=w_attr_2, + bias_attr=b_attr_2), + nn.Dropout(config.MODEL.DROPOUT), + ) + else: + # classifier head (for finetuning) + w_attr_1, b_attr_1 = self._init_weights() + self.classifier = nn.Linear(embed_dim, + num_classes, + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + def _init_weights(self): + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.KaimingUniform()) + bias_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.KaimingUniform()) + return weight_attr, bias_attr + + def forward(self, x): + x = self.patch_embedding(x) + x, attn = self.encoder(x) + logits = self.classifier(x[:, 0]) # take only cls_token as classifier + return logits + + +def build_vit(config): + model = VisualTransformer(image_size=config.DATA.IMAGE_SIZE, + patch_size=config.MODEL.TRANS.PATCH_SIZE, + in_channels=3, + num_classes=config.MODEL.NUM_CLASSES, + embed_dim=config.MODEL.TRANS.EMBED_DIM, + depth=config.MODEL.TRANS.DEPTH, + num_heads=config.MODEL.TRANS.NUM_HEADS, + mlp_ratio=config.MODEL.TRANS.MLP_RATIO, + qkv_bias=config.MODEL.TRANS.QKV_BIAS, + dropout=config.MODEL.DROPOUT, + attention_dropout=config.MODEL.ATTENTION_DROPOUT, + droppath=config.MODEL.DROPPATH, + train_from_scratch=False) + return model diff --git a/image_classification/ViT/utils.py b/image_classification/ViT/utils.py new file mode 100644 index 00000000..44800527 --- /dev/null +++ b/image_classification/ViT/utils.py @@ -0,0 +1,120 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""utils for ViT + +Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training +and WarmupCosineScheduler for training + +""" + +import math +from paddle.optimizer.lr import LRScheduler + + +class AverageMeter(): + """ Meter for monitoring losses""" + def __init__(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.reset() + + def reset(self): + """reset all values to zeros""" + self.avg = 0 + self.sum = 0 + self.cnt = 0 + + def update(self, val, n=1): + """update avg by val and n, where val is the avg of n values""" + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + + +def get_exclude_from_weight_decay_fn(exclude_list=[]): + """ Set params with no weight decay during the training + + For certain params, e.g., positional encoding in ViT, weight decay + may not needed during the learning, this method is used to find + these params. + + Args: + exclude_list: a list of params names which need to exclude + from weight decay. + Returns: + exclude_from_weight_decay_fn: a function returns True if param + will be excluded from weight decay + """ + if len(exclude_list) == 0: + exclude_from_weight_decay_fn = None + else: + def exclude_fn(param): + for name in exclude_list: + if param.endswith(name): + return False + return True + exclude_from_weight_decay_fn = exclude_fn + return exclude_from_weight_decay_fn + + +class WarmupCosineScheduler(LRScheduler): + """Warmup Cosine Scheduler + + First apply linear warmup, then apply cosine decay schedule. + Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" + Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining + "total_epochs - warmup_epochs" + + Attributes: + learning_rate: the starting learning rate (without warmup), not used here! + warmup_start_lr: warmup starting learning rate + start_lr: the starting learning rate (without warmup) + end_lr: the ending learning rate after whole loop + warmup_epochs: # of epochs for warmup + total_epochs: # of total epochs (include warmup) + """ + def __init__(self, + learning_rate, + warmup_start_lr, + start_lr, + end_lr, + warmup_epochs, + total_epochs, + cycles=0.5, + last_epoch=-1, + verbose=False): + """init WarmupCosineScheduler """ + self.warmup_epochs = warmup_epochs + self.total_epochs = total_epochs + self.warmup_start_lr = warmup_start_lr + self.start_lr = start_lr + self.end_lr = end_lr + self.cycles = cycles + super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + """ return lr value """ + if self.last_epoch < self.warmup_epochs: + val = (self.start_lr - self.warmup_start_lr) * float( + self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr + return val + + progress = float(self.last_epoch - self.warmup_epochs) / float( + max(1, self.total_epochs - self.warmup_epochs)) + val = max(0.0, 0.5 * (1. 
+ math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) + return val diff --git a/image_classification/ViT/vit.png b/image_classification/ViT/vit.png new file mode 100644 index 00000000..a6929f74 Binary files /dev/null and b/image_classification/ViT/vit.png differ diff --git a/image_classification/gMLP/README.md b/image_classification/gMLP/README.md new file mode 100644 index 00000000..7c759ebb --- /dev/null +++ b/image_classification/gMLP/README.md @@ -0,0 +1,163 @@ +# Pay Attention to MLPs, [arxiv](https://arxiv.org/abs/2105.08050) + +PaddlePaddle training/validation code and pretrained models for **gMLP**. + +The 3rd party pytorch implementation is [here](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/mlp_mixer.py). + +This implementation is developed by [PaddleViT](https://github.com/BR-IDL/PaddleViT.git). + + +

+![gMLP Model Overview](./gmlp.png)

+ + +### Update +Update (2021-08-11): Code is released and ported weights are uploaded. + +## Models Zoo +| Model | Acc@1 | Acc@5 | Image Size | Crop_pct | Interpolation | Link | +|--------------------------------|-------|-------|------------|----------|---------------|--------------| +| gmlp_s16_224 | 79.64 | 94.63 | 224 | 0.875 | bicubic | [google](https://drive.google.com/file/d/1TLypFly7aW0oXzEHfeDSz2Va4RHPRqe5/view?usp=sharing)/[baidu](https://pan.baidu.com/s/13UUz1eGIKyqyhtwedKLUMA)(bcth) | + +> *The results are evaluated on ImageNet2012 validation set. +> +> Note: gMLP weights are ported from [timm](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/mlp_mixer.py) +## Notebooks +We provide a few notebooks in aistudio to help you get started: + +**\*(coming soon)\*** + + +## Requirements +- Python>=3.6 +- yaml>=0.2.5 +- [PaddlePaddle](https://www.paddlepaddle.org.cn/documentation/docs/en/install/index_en.html)>=2.1.0 +- [yacs](https://github.com/rbgirshick/yacs)>=0.1.8 + +## Data +ImageNet2012 dataset is used in the following folder structure: +``` +│imagenet/ +├──train/ +│ ├── n01440764 +│ │ ├── n01440764_10026.JPEG +│ │ ├── n01440764_10027.JPEG +│ │ ├── ...... +│ ├── ...... +├──val/ +│ ├── n01440764 +│ │ ├── ILSVRC2012_val_00000293.JPEG +│ │ ├── ILSVRC2012_val_00002138.JPEG +│ │ ├── ...... +│ ├── ...... +``` + +## Usage +To use the model with pretrained weights, download the `.pdparam` weight file and change related file paths in the following python scripts. The model config files are located in `./configs/`. + +For example, assume the downloaded weight file is stored in `./gmlp_s16_224.pdparams`, to use the `gmlp_s16_224` model in python: +```python +from config import get_config +from gmlp import build_gated_mlp as build_model +# config files in ./configs/ +config = get_config('./configs/gmlp_s16_224.yaml') +# build model +model = build_model(config) +# load pretrained weights, .pdparams is NOT needed +model_state_dict = paddle.load('./gmlp_s16_224') +model.set_dict(model_state_dict) +``` + +## Evaluation +To evaluate gMLP model performance on ImageNet2012 with a single GPU, run the following script using command line: +```shell +sh run_eval.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/gmlp_s16_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./gmlp_s16_224' +``` + +
+ + +Run evaluation using multi-GPUs: + + + +```shell +sh run_eval_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/gmlp_s16_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ + -eval \ + -pretrained='./gmlp_s16_224' +``` + +
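+## Inference Example
+The following is a minimal single-image inference sketch (it is not one of the provided scripts). It assumes you run it from the `image_classification/gMLP/` folder, that the ported weights are stored at `./gmlp_s16_224.pdparams`, and that `./example.jpg` is a placeholder path to any RGB image:
+```python
+import paddle
+import paddle.nn.functional as F
+from paddle.vision import image_load
+from config import get_config
+from datasets import get_val_transforms
+from gmlp import build_gated_mlp as build_model
+
+# build the model from the yaml config and load the ported weights
+config = get_config('./configs/gmlp_s16_224.yaml')
+model = build_model(config)
+model.set_dict(paddle.load('./gmlp_s16_224.pdparams'))  # full path to the weight file
+model.eval()
+
+# preprocess one image with the same transforms used for validation
+img = image_load('./example.jpg').convert('RGB')  # placeholder image path
+val_transforms = get_val_transforms(config)
+x = val_transforms(img).unsqueeze(0)  # shape: [1, 3, 224, 224]
+
+with paddle.no_grad():
+    prob = F.softmax(model(x), axis=-1)
+print(prob.argmax(axis=-1).item())  # predicted ImageNet-1k class index
+```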
+ +## Training +To train the gMLP Transformer model on ImageNet2012 with single GPU, run the following script using command line: +```shell +sh run_train.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ + -cfg='./configs/gmlp_s16_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=32 \ + -data_path='/dataset/imagenet' \ +``` + +
+ + +Run training using multi-GPUs: + + + +```shell +sh run_train_multi.sh +``` +or +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ + -cfg='./configs/gmlp_s16_224.yaml' \ + -dataset='imagenet2012' \ + -batch_size=16 \ + -data_path='/dataset/imagenet' \ +``` + +
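+During training, checkpoints and optimizer states are written to `./output/train-<timestamp>/` every `SAVE_FREQ` epochs (and at the final epoch), named like `GatedMLP-Epoch-<epoch>-Loss-<loss>.pdparams` / `.pdopt`. To continue an interrupted run, the training scripts also accept `-resume` and `-last_epoch`; below is a sketch with an illustrative checkpoint path (pass the checkpoint prefix without the extension, which the scripts append):
+```shell
+# the -resume path below is illustrative; use your actual checkpoint prefix
+CUDA_VISIBLE_DEVICES=0 \
+python main_single_gpu.py \
+    -cfg='./configs/gmlp_s16_224.yaml' \
+    -dataset='imagenet2012' \
+    -batch_size=32 \
+    -data_path='/dataset/imagenet' \
+    -resume='./output/train-20210811-12-00-00/GatedMLP-Epoch-20-Loss-6.0' \
+    -last_epoch=20
+```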
+ + +## Visualization Attention Map +**(coming soon)** + +## Reference +``` +@article{zhang2021gmlp, + title={GMLP: Building Scalable and Flexible Graph Neural Networks with Feature-Message Passing}, + author={Zhang, Wentao and Shen, Yu and Lin, Zheyu and Li, Yang and Li, Xiaosen and Ouyang, Wen and Tao, Yangyu and Yang, Zhi and Cui, Bin}, + journal={arXiv preprint arXiv:2104.09880}, + year={2021} +} +``` diff --git a/image_classification/gMLP/config.py b/image_classification/gMLP/config.py new file mode 100644 index 00000000..b6db78e0 --- /dev/null +++ b/image_classification/gMLP/config.py @@ -0,0 +1,143 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration + +Configuration for data, model archtecture, and training, etc. +Config can be set by .yaml file or by argparser(limited usage) + + +""" +import os +from yacs.config import CfgNode as CN +import yaml + +_C = CN() +_C.BASE = [''] + +# data settings +_C.DATA = CN() +_C.DATA.BATCH_SIZE = 256 #256 # train batch_size for single GPU +_C.DATA.BATCH_SIZE_EVAL = 8 #64 # val batch_size for single GPU +_C.DATA.DATA_PATH = '/dataset/imagenet/' # path to dataset +_C.DATA.DATASET = 'imagenet2012' # dataset name +_C.DATA.IMAGE_SIZE = 224 # input image size: 224 for pretrain, 384 for finetune +_C.DATA.CROP_PCT = 1.0 # input image scale ratio, scale is applied before centercrop in eval mode +_C.DATA.NUM_WORKERS = 4 # number of data loading threads + +# model settings +_C.MODEL = CN() +_C.MODEL.TYPE = 'GatedMLP' +_C.MODEL.NAME = 'GatedMLP' +_C.MODEL.RESUME = None +_C.MODEL.PRETRAINED = None +_C.MODEL.NUM_CLASSES = 1000 +_C.MODEL.DROPOUT = 0.1 +_C.MODEL.DROPPATH = 0.1 + +# transformer settings +_C.MODEL.MIXER = CN() +_C.MODEL.MIXER.PATCH_SIZE = 16 +_C.MODEL.MIXER.HIDDEN_SIZE = 256 +_C.MODEL.MIXER.NUM_LAYERS = 30 + +# training settings +_C.TRAIN = CN() +_C.TRAIN.LAST_EPOCH = 0 +_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 3 #34 # ~ 10k steps for 4096 batch size +_C.TRAIN.WEIGHT_DECAY = 0.01 #0.3 # 0.0 for finetune +_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune +_C.TRAIN.WARMUP_START_LR = 1e-6 #0.0 +_C.TRAIN.END_LR = 1e-5 +_C.TRAIN.GRAD_CLIP = 1.0 +_C.TRAIN.ACCUM_ITER = 2 #1 + +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' +_C.TRAIN.LR_SCHEDULER.MILESTONES = "30, 60, 90" # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 # only used in StepLRScheduler + +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'AdamW' +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for adamW +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# misc +_C.SAVE = "./output" +_C.TAG = "default" +_C.SAVE_FREQ = 20 # freq to save chpt +_C.REPORT_FREQ = 50 # freq to logging info +_C.VALIDATE_FREQ = 20 # freq to do validation +_C.SEED = 0 +_C.EVAL = False # run evaluation only +_C.LOCAL_RANK = 0 +_C.NGPUS = 1 + + +def _update_config_from_file(config, 
cfg_file): + config.defrost() + with open(cfg_file, 'r') as infile: + yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader) + for cfg in yaml_cfg.setdefault('BASE', ['']): + if cfg: + _update_config_from_file( + config, os.path.join(os.path.dirname(cfg_file), cfg) + ) + print('merging config from {}'.format(cfg_file)) + config.merge_from_file(cfg_file) + config.freeze() + +def update_config(config, args): + """Update config by ArgumentParser + Args: + args: ArgumentParser contains options + Return: + config: updated config + """ + if args.cfg: + _update_config_from_file(config, args.cfg) + config.defrost() + if args.dataset: + config.DATA.DATASET = args.dataset + if args.batch_size: + config.DATA.BATCH_SIZE = args.batch_size + if args.image_size: + config.DATA.IMAGE_SIZE = args.image_size + if args.data_path: + config.DATA.DATA_PATH = args.data_path + if args.ngpus: + config.NGPUS = args.ngpus + if args.eval: + config.EVAL = True + config.DATA.BATCH_SIZE_EVAL = args.batch_size + if args.pretrained: + config.MODEL.PRETRAINED = args.pretrained + if args.resume: + config.MODEL.RESUME = args.resume + if args.last_epoch: + config.TRAIN.LAST_EPOCH = args.last_epoch + + #config.freeze() + return config + + +def get_config(cfg_file=None): + """Return a clone of config or load from yaml file""" + config = _C.clone() + if cfg_file: + _update_config_from_file(config, cfg_file) + return config diff --git a/image_classification/gMLP/configs/gmlp_b16_224.yaml b/image_classification/gMLP/configs/gmlp_b16_224.yaml new file mode 100644 index 00000000..11f2d6ea --- /dev/null +++ b/image_classification/gMLP/configs/gmlp_b16_224.yaml @@ -0,0 +1,12 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: GatedMLP + NAME: gmlp_b16_224 + MIXER: + PATCH_SIZE: 16 + HIDDEN_SIZE: 512 + NUM_LAYERS: 30 + MLP_RATIO: 6.0 + diff --git a/image_classification/gMLP/configs/gmlp_s16_224.yaml b/image_classification/gMLP/configs/gmlp_s16_224.yaml new file mode 100644 index 00000000..75046232 --- /dev/null +++ b/image_classification/gMLP/configs/gmlp_s16_224.yaml @@ -0,0 +1,11 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: GatedMLP + NAME: gmlp_s16_224 + MIXER: + PATCH_SIZE: 16 + HIDDEN_SIZE: 256 + NUM_LAYERS: 30 + diff --git a/image_classification/gMLP/datasets.py b/image_classification/gMLP/datasets.py new file mode 100644 index 00000000..e207f9ba --- /dev/null +++ b/image_classification/gMLP/datasets.py @@ -0,0 +1,187 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Dataset related classes and methods for ViT training and validation +Cifar10, Cifar100 and ImageNet2012 are supported +""" + +import os +import math +from paddle.io import Dataset, DataLoader, DistributedBatchSampler +from paddle.vision import transforms, datasets, image_load + +class ImageNet2012Dataset(Dataset): + """Build ImageNet2012 dataset + + This class gets train/val imagenet datasets, which loads transfomed data and labels. 
+ + Attributes: + file_folder: path where imagenet images are stored + transform: preprocessing ops to apply on image + img_path_list: list of full path of images in whole dataset + label_list: list of labels of whole dataset + """ + + def __init__(self, file_folder, mode="train", transform=None): + """Init ImageNet2012 Dataset with dataset file path, mode(train/val), and transform""" + super(ImageNet2012Dataset, self).__init__() + assert mode in ["train", "val"] + self.file_folder = file_folder + self.transform = transform + self.img_path_list = [] + self.label_list = [] + + if mode == "train": + self.list_file = os.path.join(self.file_folder, "train_list.txt") + else: + self.list_file = os.path.join(self.file_folder, "val_list.txt") + + with open(self.list_file, 'r') as infile: + for line in infile: + img_path = line.strip().split()[0] + img_label = int(line.strip().split()[1]) + self.img_path_list.append(os.path.join(self.file_folder, img_path)) + self.label_list.append(img_label) + print(f'----- Imagenet2012 image {mode} list len = {len(self.label_list)}') + + def __len__(self): + return len(self.label_list) + + def __getitem__(self, index): + data = image_load(self.img_path_list[index]).convert('RGB') + data = self.transform(data) + label = self.label_list[index] + + return data, label + + +def get_train_transforms(config): + """ Get training transforms + + For training, a RandomResizedCrop is applied, then normalization is applied with + [0.5, 0.5, 0.5] mean and std. The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + transforms_train = transforms.Compose([ + transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), + scale=(0.05, 1.0)), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_train + + +def get_val_transforms(config): + """ Get training transforms + + For validation, image is first Resize then CenterCrop to image_size. + Then normalization is applied with [0.5, 0.5, 0.5] mean and std. + The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) + transforms_val = transforms.Compose([ + transforms.Resize(scale_size, 'bicubic'), # single int for resize shorter side of image + transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_val + + +def get_dataset(config, mode='train'): + """ Get dataset from config and mode (train/val) + + Returns the related dataset object according to configs and mode(train/val) + + Args: + config: configs contains dataset related settings. 
see config.py for details + Returns: + dataset: dataset object + """ + assert mode in ['train', 'val'] + if config.DATA.DATASET == "cifar10": + if mode == 'train': + dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "cifar100": + if mode == 'train': + dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "imagenet2012": + if mode == 'train': + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_train_transforms(config)) + else: + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_val_transforms(config)) + else: + raise NotImplementedError( + "[{config.DATA.DATASET}] Only cifar10, cifar100, imagenet2012 are supported now") + return dataset + + +def get_dataloader(config, dataset, mode='train', multi_process=False): + """Get dataloader with config, dataset, mode as input, allows multiGPU settings. + + Multi-GPU loader is implements as distributedBatchSampler. + + Args: + config: see config.py for details + dataset: paddle.io.dataset object + mode: train/val + multi_process: if True, use DistributedBatchSampler to support multi-processing + Returns: + dataloader: paddle.io.DataLoader object. + """ + + if mode == 'train': + batch_size = config.DATA.BATCH_SIZE + else: + batch_size = config.DATA.BATCH_SIZE_EVAL + + if multi_process is True: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=(mode == 'train')) + dataloader = DataLoader(dataset, + batch_sampler=sampler, + num_workers=config.DATA.NUM_WORKERS) + else: + dataloader = DataLoader(dataset, + batch_size=batch_size, + num_workers=config.DATA.NUM_WORKERS, + shuffle=(mode == 'train')) + return dataloader diff --git a/image_classification/gMLP/droppath.py b/image_classification/gMLP/droppath.py new file mode 100644 index 00000000..fcff05e9 --- /dev/null +++ b/image_classification/gMLP/droppath.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth +""" + +import paddle +import paddle.nn as nn + +def drop_path(inputs, drop_prob=0., training=False): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if drop_prob == 0. 
or not training: + return inputs + keep_prob = 1 - drop_prob + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide(keep_prob) * random_tensor # divide is to keep same output expectation + return output + + +class DropPath(nn.Layer): + """DropPath class""" + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, inputs): + return drop_path(inputs, self.drop_prob, self.training) diff --git a/image_classification/gMLP/gmlp.png b/image_classification/gMLP/gmlp.png new file mode 100644 index 00000000..db2c08ef Binary files /dev/null and b/image_classification/gMLP/gmlp.png differ diff --git a/image_classification/gMLP/gmlp.py b/image_classification/gMLP/gmlp.py new file mode 100644 index 00000000..47686075 --- /dev/null +++ b/image_classification/gMLP/gmlp.py @@ -0,0 +1,204 @@ +import math +import copy +from functools import partial +import paddle +import paddle.nn as nn +from droppath import DropPath + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + def forward(self, x): + return x + + +class PatchEmbedding(nn.Layer): + """Patch Embeddings + + Apply patch embeddings on input images. Embeddings is implemented using a Conv2D op. + + Attributes: + image_size: int, input image size, default: 224 + patch_size: int, size of patch, default: 4 + in_channels: int, input image channels, default: 3 + embed_dim: int, embedding dimension, default: 96 + """ + + def __init__(self, image_size=224, patch_size=4, in_channels=3, embed_dim=96, norm_layer=None): + super(PatchEmbedding, self).__init__() + image_size = (image_size, image_size) + patch_size = (patch_size, patch_size) + patches_resolution = [image_size[0]//patch_size[0], image_size[1]//patch_size[1]] + self.image_size = image_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + self.in_channels = in_channels + self.embed_dim = embed_dim + self.patch_embed = nn.Conv2D(in_channels=in_channels, + out_channels=embed_dim, + kernel_size=patch_size, + stride=patch_size) + self.norm = norm_layer if norm_layer is not None else Identity() + + def forward(self, x): + x = self.patch_embed(x) # [batch, embed_dim, h, w] h,w = patch_resolution + x = x.flatten(start_axis=2, stop_axis=-1) # [batch, embed_dim, h*w] h*w = num_patches + x = x.transpose([0, 2, 1]) # [batch, h*w, embed_dim] + x = self.norm(x) # [batch, num_patches, embed_dim] + return x + + +class GMlp(nn.Layer): + """ GatedMLP module + + Impl using nn.Linear and activation is GELU, dropout is applied. 
+ Ops: fc -> act -> dropout -> gate -> fc -> dropout + + Attributes: + fc1: nn.Linear + fc2: nn.Linear + act: GELU + gate: gate layer + dropout1: dropout after fc1 + dropout2: dropout after fc2 + """ + + def __init__(self, in_features, hidden_features, gate_layer=None, dropout=0.): + super(GMlp, self).__init__() + w_attr_1, b_attr_1 = self._init_weights() + self.fc1 = nn.Linear(in_features, + hidden_features, + weight_attr=w_attr_1, + bias_attr=b_attr_1) + if gate_layer is not None: + assert hidden_features % 2 == 0 + self.gate = gate_layer(hidden_features) + hidden_features = hidden_features // 2 + else: + self.gate = Identity() + w_attr_2, b_attr_2 = self._init_weights() + self.fc2 = nn.Linear(hidden_features, + in_features, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + self.act = nn.GELU() + self.dropout = nn.Dropout(dropout) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierUniform()) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Normal(std=1e-6)) + return weight_attr, bias_attr + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.dropout(x) + x = self.gate(x) + x = self.fc2(x) + x = self.dropout(x) + return x + + +class SpatialGatingUnit(nn.Layer): + def __init__(self, dim, seq_len): + super(SpatialGatingUnit, self).__init__() + gate_dim = dim // 2 + self.norm = nn.LayerNorm(gate_dim, epsilon=1e-6) + w_attr, b_attr = self._init_weights() + self.proj = nn.Linear(seq_len, + seq_len, + weight_attr=w_attr, + bias_attr=b_attr) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(std=1e-6)) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(1)) + return weight_attr, bias_attr + + def forward(self, x): + u, v = x.chunk(2, axis=-1) + v = self.norm(v) + v = self.proj(v.transpose([0, 2, 1])) + return u * v.transpose([0, 2, 1]) + + +class SpatialGatingBlock(nn.Layer): + def __init__(self, dim, seq_len, mlp_ratio=4, dropout=0., droppath=0.): + super(SpatialGatingBlock, self).__init__() + channels_dim = int(mlp_ratio * dim) + self.norm = nn.LayerNorm(dim, epsilon=1e-6) + sgu = partial(SpatialGatingUnit, seq_len=seq_len) + self.mlp_channels = GMlp(dim, channels_dim, gate_layer=sgu, dropout=dropout) + self.drop_path = DropPath(droppath) + + def forward(self, x): + h = x + x = self.norm(x) + x = self.mlp_channels(x) + x = self.drop_path(x) + x = x + h + + return x + + +class GatedMlp(nn.Layer): + def __init__(self, + num_classes=1000, + image_size=224, + in_channels=3, + patch_size=16, + num_mixer_layers=30, + embed_dim=256, + mlp_ratio=6, + dropout=0., + droppath=0., + patch_embed_norm=False): + super(GatedMlp, self).__init__() + self.num_classes = num_classes + self.num_features = embed_dim + self.embed_dim = embed_dim + + norm_layer=nn.LayerNorm(embed_dim, epsilon=1e-6) + self.patch_embed = PatchEmbedding( + image_size=image_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim, + norm_layer=norm_layer if patch_embed_norm else None) + + self.mixer_layers = nn.Sequential( + *[SpatialGatingBlock( + embed_dim, + self.patch_embed.num_patches, + mlp_ratio, + dropout, + droppath) for _ in range(num_mixer_layers)]) + + self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6) + self.head = nn.Linear(embed_dim, self.num_classes) + + def forward_features(self, x): + x = self.patch_embed(x) + x = self.mixer_layers(x) + x = self.norm(x) + x = x.mean(axis=1) + return x + + def forward(self, x): + x = self.forward_features(x) + x = 
self.head(x) + return x + + +def build_gated_mlp(config): + model = GatedMlp(num_classes=config.MODEL.NUM_CLASSES, + image_size=config.DATA.IMAGE_SIZE, + in_channels=3, + num_mixer_layers=config.MODEL.MIXER.NUM_LAYERS, + embed_dim=config.MODEL.MIXER.HIDDEN_SIZE, + mlp_ratio=6, + dropout=config.MODEL.DROPOUT, + droppath=config.MODEL.DROPPATH) + return model diff --git a/image_classification/gMLP/main_multi_gpu.py b/image_classification/gMLP/main_multi_gpu.py new file mode 100644 index 00000000..4189e737 --- /dev/null +++ b/image_classification/gMLP/main_multi_gpu.py @@ -0,0 +1,365 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""gMLP training/validation using multiple GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as dist +from datasets import get_dataloader, get_dataset +from gmlp import build_gated_mlp as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('gMLP') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +arguments = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, arguments) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +#config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of 
iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + # + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + dist.all_reduce(loss) + dist.all_reduce(acc1) + dist.all_reduce(acc5) + loss = loss / dist.get_world_size() + acc1 = acc1 / dist.get_world_size() + acc5 = acc5 / dist.get_world_size() + + batch_size = paddle.to_tensor(image.shape[0]) + dist.all_reduce(batch_size) + + val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) + val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) + val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0]) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main_worker(*args): + # 0. Preparation + dist.init_parallel_env() + last_epoch = config.TRAIN.LAST_EPOCH + world_size = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + seed = config.SEED + local_rank + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # 1. Create model + model = build_model(config) + model = paddle.DataParallel(model) + # 2. 
Create train and val dataloader + dataset_train, dataset_val = args[0], args[1] + dataloader_train = get_dataloader(config, dataset_train, 'train', True) + dataloader_val = get_dataloader(config, dataset_val, 'test', True) + total_batch_train = len(dataloader_train) + total_batch_val = len(dataloader_val) + logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. Define optimizer and lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip, + #apply_decay_param_fun=get_exclude_from_weight_decay_fn([ + # 'absolute_pos_embed', 'relative_position_bias_table']), + ) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # 5. 
Load pretrained model / load resumt model and optimizer states + if config.MODEL.PRETRAINED: + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + + # 6. Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + + # 6. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER) + scheduler.step() + + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if local_rank == 0: + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +def main(): + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS + dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + + +if __name__ == "__main__": + main() diff --git a/image_classification/gMLP/main_single_gpu.py b/image_classification/gMLP/main_single_gpu.py new file mode 100644 index 00000000..fa11a1f4 --- /dev/null 
+++ b/image_classification/gMLP/main_single_gpu.py @@ -0,0 +1,334 @@ + +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""gMLP training/validation using single GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from datasets import get_dataloader +from datasets import get_dataset +from gmlp import build_gated_mlp as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('gMLP') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +args = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, args) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +#config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in 
nn.CrossEntropyLoss is set to 'mean' + #loss = loss / accum_iter + + loss.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + batch_size = image.shape[0] + val_loss_meter.update(loss.numpy()[0], batch_size) + val_acc1_meter.update(acc1.numpy()[0], batch_size) + val_acc5_meter.update(acc5.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main(): + # 0. Preparation + last_epoch = config.TRAIN.LAST_EPOCH + seed = config.SEED + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + #paddle.set_device('gpu:0') + # 1. Create model + model = build_model(config) + #model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + dataloader_train = get_dataloader(config, dataset_train, 'train', False) + dataloader_val = get_dataloader(config, dataset_val, 'val', False) + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. 
Define lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.TRAIN.LR_SCHEDULER.NAME == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + # 5. Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + # 6. Load pretrained model or load resume model and optimizer states + if config.MODEL.PRETRAINED: + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume: Load model and optimizer from {config.MODEL.RESUME}") + # 7. Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + # 8.
Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=len(dataloader_train), + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + ) + scheduler.step() + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +if __name__ == "__main__": + main() diff --git a/image_classification/gMLP/port_weights/load_pytorch_weights.py b/image_classification/gMLP/port_weights/load_pytorch_weights.py new file mode 100644 index 00000000..84bf7540 --- /dev/null +++ b/image_classification/gMLP/port_weights/load_pytorch_weights.py @@ -0,0 +1,162 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
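# NOTE (editor's addition, illustrative only -- not part of the original script):
# this file ports timm's pretrained PyTorch 'gmlp_s16_224' weights into the
# Paddle model defined in gmlp.py and saves them as './gmlp_s16_224.pdparams'.
# The training/eval entry points above expect the checkpoint path WITHOUT the
# '.pdparams' suffix, since main_single_gpu.py / main_multi_gpu.py append
# '.pdparams' (and '.pdopt' for -resume) before calling paddle.load(), e.g.:
#
#   python main_single_gpu.py -cfg='./configs/gmlp_s16_224.yaml' \
#       -dataset='imagenet2012' -data_path='/dataset/imagenet' \
#       -eval -pretrained='./gmlp_s16_224'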
+ +import os +import argparse +import numpy as np +import paddle +import torch +import timm +from gmlp import build_gated_mlp +from config import get_config +from config import update_config + +config = get_config('./configs/gmlp_s16_224.yaml') +print(config) + + +def print_model_named_params(model): + print('----------------------------------') + for name, param in model.named_parameters(): + print(name, param.shape) + print('----------------------------------') + + +def print_model_named_buffers(model): + print('----------------------------------') + for name, param in model.named_buffers(): + print(name, param.shape) + print('----------------------------------') + + +def torch_to_paddle_mapping(): + mapping = [ + ('stem.proj', 'patch_embed.patch_embed'), + ('norm', 'norm'), + ('head', 'head'), + ] + + num_layers = 30 + for idx in range(num_layers): + pp_prefix = f'mixer_layers.{idx}' + th_prefix = f'blocks.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm', f'{pp_prefix}.norm'), + (f'{th_prefix}.mlp_channels.fc1', f'{pp_prefix}.mlp_channels.fc1'), + (f'{th_prefix}.mlp_channels.gate.norm', f'{pp_prefix}.mlp_channels.gate.norm'), + (f'{th_prefix}.mlp_channels.gate.proj', f'{pp_prefix}.mlp_channels.gate.proj'), + (f'{th_prefix}.mlp_channels.fc2', f'{pp_prefix}.mlp_channels.fc2'), + ] + mapping.extend(layer_mapping) + + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'**SET** {th_name} {th_shape} **TO** {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + + if len(value.shape) == 2 and transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, param in paddle_model.named_buffers(): + pd_params[name] = param + for name, param in torch_model.named_buffers(): + th_params[name] = param + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + _set_value(th_name, pd_name) + else: # weight & bias + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + return paddle_model + + + + + +def main(): + + paddle.set_device('cpu') + paddle_model = build_gated_mlp(config) + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + + print('+++++++++++++++++++++++++++++++++++') + device = torch.device('cpu') + torch_model = timm.create_model('gmlp_s16_224', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + # check correctness + x = np.random.randn(2, 3, 224, 224).astype('float32') + x_paddle = paddle.to_tensor(x) + x_torch = torch.Tensor(x).to(device) + + out_torch = torch_model(x_torch) + out_paddle = paddle_model(x_paddle) + + out_torch = out_torch.data.cpu().numpy() + out_paddle = out_paddle.cpu().numpy() + + print(out_torch.shape, out_paddle.shape) + print(out_torch[0:100]) + print(out_paddle[0:100]) + assert np.allclose(out_torch, out_paddle, atol = 1e-2) + + # save weights for paddle model + model_path = os.path.join('./gmlp_s16_224.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + print('all done') + + +if __name__ == "__main__": + main() diff --git a/image_classification/gMLP/run_eval.sh b/image_classification/gMLP/run_eval.sh new file mode 100644 index 00000000..8f983060 --- /dev/null +++ b/image_classification/gMLP/run_eval.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=7 \ +python main_single_gpu.py \ +-cfg='./configs/gmlp_s16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./gmlp_s16_224' diff --git a/image_classification/gMLP/run_eval_multi.sh b/image_classification/gMLP/run_eval_multi.sh new file mode 100644 index 00000000..bd4f4898 --- /dev/null +++ b/image_classification/gMLP/run_eval_multi.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/gmlp_s16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=8 \ +-data_path='/dataset/imagenet' \ +-eval \ +-pretrained='./gmlp_s16_224' \ +-ngpus=4 diff --git a/image_classification/gMLP/run_train.sh b/image_classification/gMLP/run_train.sh new file mode 100644 index 00000000..2c394f3b --- /dev/null +++ b/image_classification/gMLP/run_train.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=7 \ +python main_single_gpu.py \ +-cfg='./configs/gmlp_s16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ diff --git a/image_classification/gMLP/run_train_multi.sh b/image_classification/gMLP/run_train_multi.sh new file mode 100644 index 00000000..2692f218 --- /dev/null +++ b/image_classification/gMLP/run_train_multi.sh @@ -0,0 +1,7 @@ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ +python main_multi_gpu.py \ +-cfg='./configs/gmlp_s16_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=32 \ +-data_path='/dataset/imagenet' \ +-ngpus=4 diff --git a/image_classification/gMLP/tests/__init__.py b/image_classification/gMLP/tests/__init__.py new file mode 100644 index 00000000..84952a81 
--- /dev/null +++ b/image_classification/gMLP/tests/__init__.py @@ -0,0 +1 @@ +# init \ No newline at end of file diff --git a/image_classification/gMLP/tests/test_gmlp.py b/image_classification/gMLP/tests/test_gmlp.py new file mode 100644 index 00000000..34b3c5f5 --- /dev/null +++ b/image_classification/gMLP/tests/test_gmlp.py @@ -0,0 +1,107 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from functools import partial +import numpy as np +import paddle +import paddle.nn as nn +from config import * +from gmlp import Identity +from gmlp import PatchEmbedding +from gmlp import GMlp +from gmlp import SpatialGatingUnit +from gmlp import SpatialGatingBlock +from gmlp import GatedMlp +from gmlp import build_gated_mlp + + +class MlpTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + paddle.set_device('cpu') + cls.config = get_config() + cls.dummy_img = np.random.randn(4, 3, 224, 224).astype('float32') + cls.dummy_tensor = paddle.to_tensor(cls.dummy_img) + cls.model = build_gated_mlp(cls.config) + + @classmethod + def tearDown(cls): + pass + + #@unittest.skip('skip for debug') + def test_out_shape(self): + out = MlpTest.model(MlpTest.dummy_tensor) + self.assertEqual(out.shape, [4, 1000]) + + #@unittest.skip('skip for debug') + def test_all_parameters_updated(self): + optim = paddle.optimizer.SGD( + parameters=MlpTest.model.parameters(), learning_rate=0.1) + out = MlpTest.model(MlpTest.dummy_tensor) + loss = out.mean() + loss.backward() + optim.step() + + for name, param in MlpTest.model.named_parameters(): + if not param.stop_gradient: + self.assertIsNotNone(param.gradient()) + self.assertNotEqual(0, np.sum(param.gradient()**2)) + + #@unittest.skip('skip for debug') + def test_embeddings(self): + embed = PatchEmbedding(embed_dim=768) + dummy_img = np.random.randn(4, 3, 224, 224).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + + embed_out = embed(dummy_tensor) + self.assertEqual(embed_out.shape, [4, 3136, 768]) + + #@unittest.skip('skip for debug') + def test_gmlp(self): + mlp_op = GMlp(768, 256, None, 0.0) + dummy_img = np.random.randn(4, 50, 768).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + out = mlp_op(dummy_tensor) + self.assertEqual(out.shape, [4, 50, 768]) + + mlp_op = GMlp(768, 256, partial(SpatialGatingUnit, seq_len=50), 0.0) + out = mlp_op(dummy_tensor) + self.assertEqual(out.shape, [4, 50, 768]) + + + #@unittest.skip('skip for debug') + def test_identity(self): + op = Identity() + dummy_img = np.random.randn(4, 50, 768).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + + out = op(dummy_tensor) + self.assertEqual(out.shape, [4, 50, 768]) + + #@unittest.skip('skip for debug') + def test_spatial_gating_block(self): + op = SpatialGatingBlock(dim=768, seq_len=50) + dummy_img = np.random.randn(4, 50, 768).astype('float32') + dummy_tensor = paddle.to_tensor(dummy_img) + + out = op(dummy_tensor) + self.assertEqual(out.shape, [4, 50, 768]) + + def 
test_spatial_gating_unit(self): + op = SpatialGatingUnit(dim=768, seq_len=50) + dummy_tensor = paddle.ones([4, 50, 768]) + + out = op(dummy_tensor) + self.assertEqual(out.shape, [4, 50, 384]) diff --git a/image_classification/gMLP/utils.py b/image_classification/gMLP/utils.py new file mode 100644 index 00000000..44800527 --- /dev/null +++ b/image_classification/gMLP/utils.py @@ -0,0 +1,120 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""utils for ViT + +Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training +and WarmupCosineScheduler for training + +""" + +import math +from paddle.optimizer.lr import LRScheduler + + +class AverageMeter(): + """ Meter for monitoring losses""" + def __init__(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.reset() + + def reset(self): + """reset all values to zeros""" + self.avg = 0 + self.sum = 0 + self.cnt = 0 + + def update(self, val, n=1): + """update avg by val and n, where val is the avg of n values""" + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + + +def get_exclude_from_weight_decay_fn(exclude_list=[]): + """ Set params with no weight decay during the training + + For certain params, e.g., positional encoding in ViT, weight decay + may not needed during the learning, this method is used to find + these params. + + Args: + exclude_list: a list of params names which need to exclude + from weight decay. + Returns: + exclude_from_weight_decay_fn: a function returns True if param + will be excluded from weight decay + """ + if len(exclude_list) == 0: + exclude_from_weight_decay_fn = None + else: + def exclude_fn(param): + for name in exclude_list: + if param.endswith(name): + return False + return True + exclude_from_weight_decay_fn = exclude_fn + return exclude_from_weight_decay_fn + + +class WarmupCosineScheduler(LRScheduler): + """Warmup Cosine Scheduler + + First apply linear warmup, then apply cosine decay schedule. + Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" + Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining + "total_epochs - warmup_epochs" + + Attributes: + learning_rate: the starting learning rate (without warmup), not used here! 
+ warmup_start_lr: warmup starting learning rate + start_lr: the starting learning rate (without warmup) + end_lr: the ending learning rate after whole loop + warmup_epochs: # of epochs for warmup + total_epochs: # of total epochs (include warmup) + """ + def __init__(self, + learning_rate, + warmup_start_lr, + start_lr, + end_lr, + warmup_epochs, + total_epochs, + cycles=0.5, + last_epoch=-1, + verbose=False): + """init WarmupCosineScheduler """ + self.warmup_epochs = warmup_epochs + self.total_epochs = total_epochs + self.warmup_start_lr = warmup_start_lr + self.start_lr = start_lr + self.end_lr = end_lr + self.cycles = cycles + super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + """ return lr value """ + if self.last_epoch < self.warmup_epochs: + val = (self.start_lr - self.warmup_start_lr) * float( + self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr + return val + + progress = float(self.last_epoch - self.warmup_epochs) / float( + max(1, self.total_epochs - self.warmup_epochs)) + val = max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) + return val diff --git a/object_detection/DETR/FreeMono.ttf b/object_detection/DETR/FreeMono.ttf new file mode 100644 index 00000000..f88bcef9 Binary files /dev/null and b/object_detection/DETR/FreeMono.ttf differ diff --git a/object_detection/DETR/auto_augment.py b/object_detection/DETR/auto_augment.py new file mode 100644 index 00000000..7cbd2dee --- /dev/null +++ b/object_detection/DETR/auto_augment.py @@ -0,0 +1,822 @@ +""" AutoAugment, RandAugment, and AugMix for PyTorch + +This code implements the searched ImageNet policies with various tweaks and improvements and +does not include any of the search code. + +AA and RA Implementation adapted from: + https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/autoaugment.py + +AugMix adapted from: + https://github.com/google-research/augmix + +Papers: + AutoAugment: Learning Augmentation Policies from Data - https://arxiv.org/abs/1805.09501 + Learning Data Augmentation Strategies for Object Detection - https://arxiv.org/abs/1906.11172 + RandAugment: Practical automated data augmentation... - https://arxiv.org/abs/1909.13719 + AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty - https://arxiv.org/abs/1912.02781 + +Hacked together by / Copyright 2020 Ross Wightman +""" +import random +import math +import re +from PIL import Image, ImageOps, ImageEnhance, ImageChops +import PIL +import numpy as np + + +_PIL_VER = tuple([int(x) for x in PIL.__version__.split('.')[:2]]) + +_FILL = (128, 128, 128) + +# This signifies the max integer that the controller RNN could predict for the +# augmentation scheme. +_MAX_LEVEL = 10. 
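# (editor's note, added for clarity) Worked example of the magnitude convention
# used below: an op at magnitude m is mapped to a concrete argument by its
# *_level_to_arg helper, e.g. Rotate at m=9 -> (9 / _MAX_LEVEL) * 30 = 27
# degrees (randomly negated), and ShearX at m=9 -> (9 / 10) * 0.3 = 0.27 shear
# factor. When magnitude noise (magnitude_std) is enabled, the sampled value is
# clipped back to [0, _MAX_LEVEL] in AugmentOp.__call__.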
+ +_HPARAMS_DEFAULT = dict( + translate_const=250, + img_mean=_FILL, +) + +_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC) + + +def _interpolation(kwargs): + interpolation = kwargs.pop('resample', Image.BILINEAR) + if isinstance(interpolation, (list, tuple)): + return random.choice(interpolation) + else: + return interpolation + + +def _check_args_tf(kwargs): + if 'fillcolor' in kwargs and _PIL_VER < (5, 0): + kwargs.pop('fillcolor') + kwargs['resample'] = _interpolation(kwargs) + + +def shear_x(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), **kwargs) + + +def shear_y(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), **kwargs) + + +def translate_x_rel(img, pct, **kwargs): + pixels = pct * img.size[0] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), **kwargs) + + +def translate_y_rel(img, pct, **kwargs): + pixels = pct * img.size[1] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), **kwargs) + + +def translate_x_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), **kwargs) + + +def translate_y_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), **kwargs) + + +def rotate(img, degrees, **kwargs): + _check_args_tf(kwargs) + if _PIL_VER >= (5, 2): + return img.rotate(degrees, **kwargs) + elif _PIL_VER >= (5, 0): + w, h = img.size + post_trans = (0, 0) + rotn_center = (w / 2.0, h / 2.0) + angle = -math.radians(degrees) + matrix = [ + round(math.cos(angle), 15), + round(math.sin(angle), 15), + 0.0, + round(-math.sin(angle), 15), + round(math.cos(angle), 15), + 0.0, + ] + + def transform(x, y, matrix): + (a, b, c, d, e, f) = matrix + return a * x + b * y + c, d * x + e * y + f + + matrix[2], matrix[5] = transform( + -rotn_center[0] - post_trans[0], -rotn_center[1] - post_trans[1], matrix + ) + matrix[2] += rotn_center[0] + matrix[5] += rotn_center[1] + return img.transform(img.size, Image.AFFINE, matrix, **kwargs) + else: + return img.rotate(degrees, resample=kwargs['resample']) + + +def auto_contrast(img, **__): + return ImageOps.autocontrast(img) + + +def invert(img, **__): + return ImageOps.invert(img) + + +def equalize(img, **__): + return ImageOps.equalize(img) + + +def solarize(img, thresh, **__): + return ImageOps.solarize(img, thresh) + + +def solarize_add(img, add, thresh=128, **__): + lut = [] + for i in range(256): + if i < thresh: + lut.append(min(255, i + add)) + else: + lut.append(i) + if img.mode in ("L", "RGB"): + if img.mode == "RGB" and len(lut) == 256: + lut = lut + lut + lut + return img.point(lut) + else: + return img + + +def posterize(img, bits_to_keep, **__): + if bits_to_keep >= 8: + return img + return ImageOps.posterize(img, bits_to_keep) + + +def contrast(img, factor, **__): + return ImageEnhance.Contrast(img).enhance(factor) + + +def color(img, factor, **__): + return ImageEnhance.Color(img).enhance(factor) + + +def brightness(img, factor, **__): + return ImageEnhance.Brightness(img).enhance(factor) + + +def sharpness(img, factor, **__): + return ImageEnhance.Sharpness(img).enhance(factor) + + +def _randomly_negate(v): + """With 50% prob, negate the value""" + return -v if random.random() > 0.5 else v + + +def _rotate_level_to_arg(level, _hparams): + # range [-30, 30] + level = 
(level / _MAX_LEVEL) * 30. + level = _randomly_negate(level) + return level, + + +def _enhance_level_to_arg(level, _hparams): + # range [0.1, 1.9] + return (level / _MAX_LEVEL) * 1.8 + 0.1, + + +def _enhance_increasing_level_to_arg(level, _hparams): + # the 'no change' level is 1.0, moving away from that towards 0. or 2.0 increases the enhancement blend + # range [0.1, 1.9] + level = (level / _MAX_LEVEL) * .9 + level = 1.0 + _randomly_negate(level) + return level, + + +def _shear_level_to_arg(level, _hparams): + # range [-0.3, 0.3] + level = (level / _MAX_LEVEL) * 0.3 + level = _randomly_negate(level) + return level, + + +def _translate_abs_level_to_arg(level, hparams): + translate_const = hparams['translate_const'] + level = (level / _MAX_LEVEL) * float(translate_const) + level = _randomly_negate(level) + return level, + + +def _translate_rel_level_to_arg(level, hparams): + # default range [-0.45, 0.45] + translate_pct = hparams.get('translate_pct', 0.45) + level = (level / _MAX_LEVEL) * translate_pct + level = _randomly_negate(level) + return level, + + +def _posterize_level_to_arg(level, _hparams): + # As per Tensorflow TPU EfficientNet impl + # range [0, 4], 'keep 0 up to 4 MSB of original image' + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 4), + + +def _posterize_increasing_level_to_arg(level, hparams): + # As per Tensorflow models research and UDA impl + # range [4, 0], 'keep 4 down to 0 MSB of original image', + # intensity/severity of augmentation increases with level + return 4 - _posterize_level_to_arg(level, hparams)[0], + + +def _posterize_original_level_to_arg(level, _hparams): + # As per original AutoAugment paper description + # range [4, 8], 'keep 4 up to 8 MSB of image' + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 4) + 4, + + +def _solarize_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 256), + + +def _solarize_increasing_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation increases with level + return 256 - _solarize_level_to_arg(level, _hparams)[0], + + +def _solarize_add_level_to_arg(level, _hparams): + # range [0, 110] + return int((level / _MAX_LEVEL) * 110), + + +LEVEL_TO_ARG = { + 'AutoContrast': None, + 'Equalize': None, + 'Invert': None, + 'Rotate': _rotate_level_to_arg, + # There are several variations of the posterize level scaling in various Tensorflow/Google repositories/papers + 'Posterize': _posterize_level_to_arg, + 'PosterizeIncreasing': _posterize_increasing_level_to_arg, + 'PosterizeOriginal': _posterize_original_level_to_arg, + 'Solarize': _solarize_level_to_arg, + 'SolarizeIncreasing': _solarize_increasing_level_to_arg, + 'SolarizeAdd': _solarize_add_level_to_arg, + 'Color': _enhance_level_to_arg, + 'ColorIncreasing': _enhance_increasing_level_to_arg, + 'Contrast': _enhance_level_to_arg, + 'ContrastIncreasing': _enhance_increasing_level_to_arg, + 'Brightness': _enhance_level_to_arg, + 'BrightnessIncreasing': _enhance_increasing_level_to_arg, + 'Sharpness': _enhance_level_to_arg, + 'SharpnessIncreasing': _enhance_increasing_level_to_arg, + 'ShearX': _shear_level_to_arg, + 'ShearY': _shear_level_to_arg, + 'TranslateX': _translate_abs_level_to_arg, + 'TranslateY': _translate_abs_level_to_arg, + 'TranslateXRel': _translate_rel_level_to_arg, + 'TranslateYRel': _translate_rel_level_to_arg, +} + + +NAME_TO_OP = { + 
'AutoContrast': auto_contrast, + 'Equalize': equalize, + 'Invert': invert, + 'Rotate': rotate, + 'Posterize': posterize, + 'PosterizeIncreasing': posterize, + 'PosterizeOriginal': posterize, + 'Solarize': solarize, + 'SolarizeIncreasing': solarize, + 'SolarizeAdd': solarize_add, + 'Color': color, + 'ColorIncreasing': color, + 'Contrast': contrast, + 'ContrastIncreasing': contrast, + 'Brightness': brightness, + 'BrightnessIncreasing': brightness, + 'Sharpness': sharpness, + 'SharpnessIncreasing': sharpness, + 'ShearX': shear_x, + 'ShearY': shear_y, + 'TranslateX': translate_x_abs, + 'TranslateY': translate_y_abs, + 'TranslateXRel': translate_x_rel, + 'TranslateYRel': translate_y_rel, +} + + +class AugmentOp: + + def __init__(self, name, prob=0.5, magnitude=10, hparams=None): + hparams = hparams or _HPARAMS_DEFAULT + self.aug_fn = NAME_TO_OP[name] + self.level_fn = LEVEL_TO_ARG[name] + self.prob = prob + self.magnitude = magnitude + self.hparams = hparams.copy() + self.kwargs = dict( + fillcolor=hparams['img_mean'] if 'img_mean' in hparams else _FILL, + resample=hparams['interpolation'] if 'interpolation' in hparams else _RANDOM_INTERPOLATION, + ) + + # If magnitude_std is > 0, we introduce some randomness + # in the usually fixed policy and sample magnitude from a normal distribution + # with mean `magnitude` and std-dev of `magnitude_std`. + # NOTE This is my own hack, being tested, not in papers or reference impls. + # If magnitude_std is inf, we sample magnitude from a uniform distribution + self.magnitude_std = self.hparams.get('magnitude_std', 0) + + def __call__(self, img): + if self.prob < 1.0 and random.random() > self.prob: + return img + magnitude = self.magnitude + if self.magnitude_std: + if self.magnitude_std == float('inf'): + magnitude = random.uniform(0, magnitude) + elif self.magnitude_std > 0: + magnitude = random.gauss(magnitude, self.magnitude_std) + magnitude = min(_MAX_LEVEL, max(0, magnitude)) # clip to valid range + level_args = self.level_fn(magnitude, self.hparams) if self.level_fn is not None else tuple() + return self.aug_fn(img, *level_args, **self.kwargs) + + +def auto_augment_policy_v0(hparams): + # ImageNet v0 policy from TPU EfficientNet impl, cannot find a paper reference. 
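# (editor's note, added for clarity) Each sub-policy below is a pair of
# (op_name, probability, magnitude) tuples; AutoAugment.__call__ picks one
# sub-policy at random per image and applies its two ops in order, each op
# firing with its own probability and with the magnitude translated to
# concrete arguments via LEVEL_TO_ARG.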
+ policy = [ + [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], + [('Color', 0.4, 9), ('Equalize', 0.6, 3)], + [('Color', 0.4, 1), ('Rotate', 0.6, 8)], + [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], + [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], + [('Color', 0.2, 0), ('Equalize', 0.8, 8)], + [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], + [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], + [('Color', 0.6, 1), ('Equalize', 1.0, 2)], + [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], + [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], + [('Color', 0.4, 7), ('Equalize', 0.6, 0)], + [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)], + [('Solarize', 0.6, 8), ('Color', 0.6, 9)], + [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], + [('Rotate', 1.0, 7), ('TranslateYRel', 0.8, 9)], + [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], + [('ShearY', 0.8, 0), ('Color', 0.6, 4)], + [('Color', 1.0, 0), ('Rotate', 0.6, 2)], + [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], + [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], + [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], + [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)], # This results in black image with Tpu posterize + [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], + [('Color', 0.8, 6), ('Rotate', 0.4, 5)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_v0r(hparams): + # ImageNet v0 policy from TPU EfficientNet impl, with variation of Posterize used + # in Google research implementation (number of bits discarded increases with magnitude) + policy = [ + [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], + [('Color', 0.4, 9), ('Equalize', 0.6, 3)], + [('Color', 0.4, 1), ('Rotate', 0.6, 8)], + [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], + [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], + [('Color', 0.2, 0), ('Equalize', 0.8, 8)], + [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], + [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], + [('Color', 0.6, 1), ('Equalize', 1.0, 2)], + [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], + [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], + [('Color', 0.4, 7), ('Equalize', 0.6, 0)], + [('PosterizeIncreasing', 0.4, 6), ('AutoContrast', 0.4, 7)], + [('Solarize', 0.6, 8), ('Color', 0.6, 9)], + [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], + [('Rotate', 1.0, 7), ('TranslateYRel', 0.8, 9)], + [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], + [('ShearY', 0.8, 0), ('Color', 0.6, 4)], + [('Color', 1.0, 0), ('Rotate', 0.6, 2)], + [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], + [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], + [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], + [('PosterizeIncreasing', 0.8, 2), ('Solarize', 0.6, 10)], + [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], + [('Color', 0.8, 6), ('Rotate', 0.4, 5)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_original(hparams): + # ImageNet policy from https://arxiv.org/abs/1805.09501 + policy = [ + [('PosterizeOriginal', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('PosterizeOriginal', 0.6, 7), ('PosterizeOriginal', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('PosterizeOriginal', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('PosterizeOriginal', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], + 
[('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_originalr(hparams): + # ImageNet policy from https://arxiv.org/abs/1805.09501 with research posterize variation + policy = [ + [('PosterizeIncreasing', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('PosterizeIncreasing', 0.6, 7), ('PosterizeIncreasing', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('PosterizeIncreasing', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('PosterizeIncreasing', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], + [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy(name='v0', hparams=None): + hparams = hparams or _HPARAMS_DEFAULT + if name == 'original': + return auto_augment_policy_original(hparams) + elif name == 'originalr': + return auto_augment_policy_originalr(hparams) + elif name == 'v0': + return auto_augment_policy_v0(hparams) + elif name == 'v0r': + return auto_augment_policy_v0r(hparams) + else: + assert False, 'Unknown AA policy (%s)' % name + + +class AutoAugment: + + def __init__(self, policy): + self.policy = policy + + def __call__(self, img): + sub_policy = random.choice(self.policy) + for op in sub_policy: + img = op(img) + return img + + +def auto_augment_transform(config_str, hparams): + """ + Create a AutoAugment transform + + :param config_str: String defining configuration of auto augmentation. Consists of multiple sections separated by + dashes ('-'). The first section defines the AutoAugment policy (one of 'v0', 'v0r', 'original', 'originalr'). 
+ The remaining sections, not order sepecific determine + 'mstd' - float std deviation of magnitude noise applied + Ex 'original-mstd0.5' results in AutoAugment with original policy, magnitude_std 0.5 + + :param hparams: Other hparams (kwargs) for the AutoAugmentation scheme + + :return: A PyTorch compatible Transform + """ + config = config_str.split('-') + policy_name = config[0] + config = config[1:] + for c in config: + cs = re.split(r'(\d.*)', c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == 'mstd': + # noise param injected via hparams for now + hparams.setdefault('magnitude_std', float(val)) + else: + assert False, 'Unknown AutoAugment config section' + aa_policy = auto_augment_policy(policy_name, hparams=hparams) + return AutoAugment(aa_policy) + + +_RAND_TRANSFORMS = [ + 'AutoContrast', + 'Equalize', + 'Invert', + 'Rotate', + 'Posterize', + 'Solarize', + 'SolarizeAdd', + 'Color', + 'Contrast', + 'Brightness', + 'Sharpness', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', + #'Cutout' # NOTE I've implement this as random erasing separately +] + + +_RAND_INCREASING_TRANSFORMS = [ + 'AutoContrast', + 'Equalize', + 'Invert', + 'Rotate', + 'PosterizeIncreasing', + 'SolarizeIncreasing', + 'SolarizeAdd', + 'ColorIncreasing', + 'ContrastIncreasing', + 'BrightnessIncreasing', + 'SharpnessIncreasing', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', + #'Cutout' # NOTE I've implement this as random erasing separately +] + + + +# These experimental weights are based loosely on the relative improvements mentioned in paper. +# They may not result in increased performance, but could likely be tuned to so. +_RAND_CHOICE_WEIGHTS_0 = { + 'Rotate': 0.3, + 'ShearX': 0.2, + 'ShearY': 0.2, + 'TranslateXRel': 0.1, + 'TranslateYRel': 0.1, + 'Color': .025, + 'Sharpness': 0.025, + 'AutoContrast': 0.025, + 'Solarize': .005, + 'SolarizeAdd': .005, + 'Contrast': .005, + 'Brightness': .005, + 'Equalize': .005, + 'Posterize': 0, + 'Invert': 0, +} + + +def _select_rand_weights(weight_idx=0, transforms=None): + transforms = transforms or _RAND_TRANSFORMS + assert weight_idx == 0 # only one set of weights currently + rand_weights = _RAND_CHOICE_WEIGHTS_0 + probs = [rand_weights[k] for k in transforms] + probs /= np.sum(probs) + return probs + + +def rand_augment_ops(magnitude=10, hparams=None, transforms=None): + hparams = hparams or _HPARAMS_DEFAULT + transforms = transforms or _RAND_TRANSFORMS + return [AugmentOp( + name, prob=0.5, magnitude=magnitude, hparams=hparams) for name in transforms] + + +class RandAugment: + def __init__(self, ops, num_layers=2, choice_weights=None): + self.ops = ops + self.num_layers = num_layers + self.choice_weights = choice_weights + + def __call__(self, img): + # no replacement when using weighted choice + ops = np.random.choice( + self.ops, self.num_layers, replace=self.choice_weights is None, p=self.choice_weights) + for op in ops: + img = op(img) + return img + + +def rand_augment_transform(config_str, hparams): + """ + Create a RandAugment transform + + :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by + dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). 
The remaining + sections, not order sepecific determine + 'm' - integer magnitude of rand augment + 'n' - integer num layers (number of transform ops selected per image) + 'w' - integer probabiliy weight index (index of a set of weights to influence choice of op) + 'mstd' - float std deviation of magnitude noise applied + 'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0) + Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5 + 'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2 + + :param hparams: Other hparams (kwargs) for the RandAugmentation scheme + + :return: A PyTorch compatible Transform + """ + magnitude = _MAX_LEVEL # default to _MAX_LEVEL for magnitude (currently 10) + num_layers = 2 # default to 2 ops per image + weight_idx = None # default to no probability weights for op choice + transforms = _RAND_TRANSFORMS + config = config_str.split('-') + assert config[0] == 'rand' + config = config[1:] + for c in config: + cs = re.split(r'(\d.*)', c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == 'mstd': + # noise param injected via hparams for now + hparams.setdefault('magnitude_std', float(val)) + elif key == 'inc': + if bool(val): + transforms = _RAND_INCREASING_TRANSFORMS + elif key == 'm': + magnitude = int(val) + elif key == 'n': + num_layers = int(val) + elif key == 'w': + weight_idx = int(val) + else: + assert False, 'Unknown RandAugment config section' + ra_ops = rand_augment_ops(magnitude=magnitude, hparams=hparams, transforms=transforms) + choice_weights = None if weight_idx is None else _select_rand_weights(weight_idx) + return RandAugment(ra_ops, num_layers, choice_weights=choice_weights) + + +_AUGMIX_TRANSFORMS = [ + 'AutoContrast', + 'ColorIncreasing', # not in paper + 'ContrastIncreasing', # not in paper + 'BrightnessIncreasing', # not in paper + 'SharpnessIncreasing', # not in paper + 'Equalize', + 'Rotate', + 'PosterizeIncreasing', + 'SolarizeIncreasing', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', +] + + +def augmix_ops(magnitude=10, hparams=None, transforms=None): + hparams = hparams or _HPARAMS_DEFAULT + transforms = transforms or _AUGMIX_TRANSFORMS + return [AugmentOp( + name, prob=1.0, magnitude=magnitude, hparams=hparams) for name in transforms] + + +class AugMixAugment: + """ AugMix Transform + Adapted and improved from impl here: https://github.com/google-research/augmix/blob/master/imagenet.py + From paper: 'AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty - + https://arxiv.org/abs/1912.02781 + """ + def __init__(self, ops, alpha=1., width=3, depth=-1, blended=False): + self.ops = ops + self.alpha = alpha + self.width = width + self.depth = depth + self.blended = blended # blended mode is faster but not well tested + + def _calc_blended_weights(self, ws, m): + ws = ws * m + cump = 1. + rws = [] + for w in ws[::-1]: + alpha = w / cump + cump *= (1 - alpha) + rws.append(alpha) + return np.array(rws[::-1], dtype=np.float32) + + def _apply_blended(self, img, mixing_weights, m): + # This is my first crack and implementing a slightly faster mixed augmentation. Instead + # of accumulating the mix for each chain in a Numpy array and then blending with original, + # it recomputes the blending coefficients and applies one PIL image blend per chain. + # TODO the results appear in the right ballpark but they differ by more than rounding. 
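# (editor's note, added for clarity) _calc_blended_weights first scales the
# Dirichlet chain weights by the overall mix factor m, then converts them into
# sequential Image.blend() alphas, so that chaining the blends below is
# intended to approximate the single weighted sum computed in _apply_basic;
# per the TODO above, the two paths are close but not numerically identical.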
+ img_orig = img.copy() + ws = self._calc_blended_weights(mixing_weights, m) + for w in ws: + depth = self.depth if self.depth > 0 else np.random.randint(1, 4) + ops = np.random.choice(self.ops, depth, replace=True) + img_aug = img_orig # no ops are in-place, deep copy not necessary + for op in ops: + img_aug = op(img_aug) + img = Image.blend(img, img_aug, w) + return img + + def _apply_basic(self, img, mixing_weights, m): + # This is a literal adaptation of the paper/official implementation without normalizations and + # PIL <-> Numpy conversions between every op. It is still quite CPU compute heavy compared to the + # typical augmentation transforms, could use a GPU / Kornia implementation. + img_shape = img.size[0], img.size[1], len(img.getbands()) + mixed = np.zeros(img_shape, dtype=np.float32) + for mw in mixing_weights: + depth = self.depth if self.depth > 0 else np.random.randint(1, 4) + ops = np.random.choice(self.ops, depth, replace=True) + img_aug = img # no ops are in-place, deep copy not necessary + for op in ops: + img_aug = op(img_aug) + mixed += mw * np.asarray(img_aug, dtype=np.float32) + np.clip(mixed, 0, 255., out=mixed) + mixed = Image.fromarray(mixed.astype(np.uint8)) + return Image.blend(img, mixed, m) + + def __call__(self, img): + mixing_weights = np.float32(np.random.dirichlet([self.alpha] * self.width)) + m = np.float32(np.random.beta(self.alpha, self.alpha)) + if self.blended: + mixed = self._apply_blended(img, mixing_weights, m) + else: + mixed = self._apply_basic(img, mixing_weights, m) + return mixed + + +def augment_and_mix_transform(config_str, hparams): + """ Create AugMix PyTorch transform + + :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by + dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). The remaining + sections, not order sepecific determine + 'm' - integer magnitude (severity) of augmentation mix (default: 3) + 'w' - integer width of augmentation chain (default: 3) + 'd' - integer depth of augmentation chain (-1 is random [1, 3], default: -1) + 'b' - integer (bool), blend each branch of chain into end result without a final blend, less CPU (default: 0) + 'mstd' - float std deviation of magnitude noise applied (default: 0) + Ex 'augmix-m5-w4-d2' results in AugMix with severity 5, chain width 4, chain depth 2 + + :param hparams: Other hparams (kwargs) for the Augmentation transforms + + :return: A PyTorch compatible Transform + """ + magnitude = 3 + width = 3 + depth = -1 + alpha = 1. 
+ blended = False + hparams['magnitude_std'] = float('inf') + config = config_str.split('-') + assert config[0] == 'augmix' + config = config[1:] + for c in config: + cs = re.split(r'(\d.*)', c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == 'mstd': + # noise param injected via hparams for now + hparams.setdefault('magnitude_std', float(val)) + elif key == 'm': + magnitude = int(val) + elif key == 'w': + width = int(val) + elif key == 'd': + depth = int(val) + elif key == 'a': + alpha = float(val) + elif key == 'b': + blended = bool(val) + else: + assert False, 'Unknown AugMix config section' + ops = augmix_ops(magnitude=magnitude, hparams=hparams) + return AugMixAugment(ops, alpha=alpha, width=width, depth=depth, blended=blended) diff --git a/object_detection/DETR/backbone.py b/object_detection/DETR/backbone.py new file mode 100644 index 00000000..2bea41eb --- /dev/null +++ b/object_detection/DETR/backbone.py @@ -0,0 +1,179 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Backbone related classes and methods for DETR +Backbone now supports ResNet50, and ResNet101 +'build_backbone' method returns resnet with position_embedding +ResNet is implemented in ./resnet.py +""" + +from collections import OrderedDict +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from position_embedding import build_position_encoding +import resnet +from utils import NestedTensor + + +class IntermediateLayerGetter(nn.LayerDict): + """ Run inference and return outputs from selected layers + + This class stores the layers needed for inferening all the selected + return layers, layers after the last return layer will be ignored. + Forward method returns a dict with layer names and corresponding output tensors + + Arguments: + model: nn.Layer, backbone model, e.g., resnet50 + return_layers: dict, dict of return layers + """ + + def __init__(self, model, return_layers): + #print([name for name, _ in model.named_children()]) + if not set(return_layers).issubset([name for name, _ in model.named_children()]): + raise ValueError('return_layers are not present in model') + orig_return_layers = return_layers + # copy return_layers is required, otherwise orig_return_layers will be empty + return_layers = {k:v for k, v in return_layers.items()} + layers = OrderedDict() + for name, module in model.named_children(): + layers[name] = module + if name in return_layers: + del return_layers[name] + if not return_layers: + break + super(IntermediateLayerGetter, self).__init__(layers) + self.return_layers = orig_return_layers + + def forward(self, x): + out = OrderedDict() + for name, module in self.named_children(): + x = module(x) + #print(f'--------{name}-------------') + #print(x) + if name in self.return_layers: + out_name = self.return_layers[name] + out[out_name] = x + return out + + +class FrozenBatchNorm2D(nn.Layer): + """Freeze the bn layer without learning and updating. 
+ + This layer can be replaced with nn.BatchNorm2D, in order to freeze + the learning, usually is used in backbone during the training. + Weights and params are same as nn.BatchNorm2D, now eps is set to 1e-5 + """ + + def __init__(self, n): + super(FrozenBatchNorm2D, self).__init__() + self.register_buffer('weight', paddle.ones([n])) + self.register_buffer('bias', paddle.zeros([n])) + self.register_buffer('_mean', paddle.zeros([n])) + self.register_buffer('_variance', paddle.ones([n])) + + def forward(self, x): + w = self.weight.reshape([1, -1, 1, 1]) + b = self.bias.reshape([1, -1, 1, 1]) + rv = self._variance.reshape([1, -1, 1, 1]) + rm = self._mean.reshape([1, -1, 1, 1]) + eps = 1e-5 + scale = w * (rv + eps).rsqrt() + bias = b - rm * scale + return x * scale + bias + + +class BackboneBase(nn.Layer): + """Backbone Base class for NestedTensor input and multiple outputs + + This class handles the NestedTensor as input, run inference through backbone + and return multiple output tensors(NestedTensors) from selected layers + """ + + def __init__(self, + backbone: nn.Layer, + train_backbone: bool, + num_channels: int, + return_interm_layers: bool): + super().__init__() + if return_interm_layers: + return_layers = {'layer0': '0', 'layer2': '1', 'layer3':'2', 'layer4':'3'} + else: + return_layers = {'layer4': '0'} + self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) + self.num_channels = num_channels + + def forward(self, tensor_list): + #Inference through resnet backbone, which takes the paddle.Tensor as input + #tensor_list contains .tensor(paddle.Tensor) and .mask(paddle.Tensor) for batch inputs + xs = self.body(tensor_list.tensors) + out = {} + for name, x in xs.items(): + # x.shape: [batch_size, feat_dim, feat_h, feat_w] + m = tensor_list.mask # [batch_size, orig_h, orig_w] + assert m is not None + m = m.unsqueeze(0).astype('float32') # [1, batch_size, orig_h, orig_w] + mask = F.interpolate(m, size=x.shape[-2:])[0] #[batch_size, feat_h, fea_w] + out[name] = NestedTensor(x, mask) + return out + + +class Backbone(BackboneBase): + """Get resnet backbone from resnet.py with multiple settings and return BackboneBase instance""" + def __init__(self, name, train_backbone, return_interm_layers, dilation): + backbone = getattr(resnet, name)(pretrained=paddle.distributed.get_rank() == 0, + norm_layer=FrozenBatchNorm2D, + replace_stride_with_dilation=[False, False, dilation]) + num_channels = 512 if name in ('resnet18', 'resnet34') else 2048 + super().__init__(backbone, train_backbone, num_channels, return_interm_layers) + + +class Joiner(nn.Sequential): + """ Joiner layers(nn.Sequential) for backbone and pos_embed + Arguments: + backbone: nn.Layer, backbone layer (resnet) + position_embedding: nn.Layer, position_embedding(learned, or sine) + """ + def __init__(self, backbone, position_embedding): + super().__init__(backbone, position_embedding) + + def forward(self, x): + # feature from resnet backbone inference + xs = self[0](x) + out = [] + pos = [] + # for each backbone output, apply position embedding + for name, xx in xs.items(): + out.append(xx) + pos.append(self[1](xx).astype(xx.tensors.dtype)) + #print(f'----- {name} pos: ---------') + #print(pos[-1]) + return out, pos + + +def build_backbone(config): + """ build resnet backbone and position embedding according to config """ + assert config.MODEL.BACKBONE in ['resnet50', 'resnet101'], "backbone name is not supported!" 
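# (editor's note, added for clarity) The Joiner built here returns, for each
# selected backbone stage, a (NestedTensor feature, positional encoding) pair;
# num_channels (2048 for resnet50/resnet101) is attached to the model so the
# downstream DETR head can size its input projection accordingly.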
+ backbone_name = config.MODEL.BACKBONE + dilation = False + train_backbone = not config.EVAL + return_interm_layers = False #TODO: impl case True for segmentation + + position_embedding = build_position_encoding(config.MODEL.TRANS.HIDDEN_SIZE) + backbone = Backbone(backbone_name, train_backbone, return_interm_layers, dilation) + model = Joiner(backbone, position_embedding) + model.num_channels = backbone.num_channels + + return model diff --git a/object_detection/DETR/box_ops.py b/object_detection/DETR/box_ops.py new file mode 100644 index 00000000..921760bc --- /dev/null +++ b/object_detection/DETR/box_ops.py @@ -0,0 +1,141 @@ +import paddle + +def box_cxcywh_to_xyxy(box): + """convert box from center-size format: + [center_x, center_y, width, height] + to top-left/bottom-right format: + [x0, y0, x1, y1] + + Args: + box: paddle.Tensor, last_dim=4, stores center-size format boxes + Return: + paddle.Tensor, last_dim=4, top-left/bottom-right format boxes + """ + + x_c, y_c, w, h = box.unbind(-1) + x0 = x_c - 0.5 * w + y0 = y_c - 0.5 * h + x1 = x_c + 0.5 * w + y1 = y_c + 0.5 * h + return paddle.stack([x0, y0, x1, y1], axis=-1) + + +def box_xyxy_to_cxcywh(box): + """convert box from top-left/bottom-right format: + [x0, y0, x1, y1] + to center-size format: + [center_x, center_y, width, height] + + Args: + box: paddle.Tensor, last_dim=4, stop-left/bottom-right format boxes + Return: + paddle.Tensor, last_dim=4, center-size format boxes + """ + + x0, y0, x1, y1 = box.unbind(-1) + xc = x0 + (x1-x0)/2 + yc = y0 + (y1-y0)/2 + w = x1 - x0 + h = y1 - y0 + return paddle.stack([xc, yc, w, h], axis=-1) + + +def box_area(boxes): + """ compute area of a set of boxes in (x1, y1, x2, y2) format + Args: + boxes: paddle.Tensor, shape = Nx4, must in (x1, y1, x2, y2) format + Return: + areas: paddle.Tensor, N, areas of each box + """ + + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +def box_iou(boxes1, boxes2): + """compute iou of 2 sets of boxes in (x1, y1, x2, y2) format + + This method returns the iou between every pair of boxes + in two sets of boxes. 
+ + Args: + boxes1: paddle.Tensor, shape=N x 4, boxes are stored in (x1, y1, x2, y2) format + boxes2: paddle.Tensor, shape=N x 4, boxes are stored in (x1, y1, x2, y2) format + Return: + iou: iou ratios between each pair of boxes in boxes1 and boxes2 + union: union areas between each pair of boxes in boxes1 and boxes2 + """ + + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + boxes1 = boxes1.unsqueeze(1) # N x 1 x 4 + lt = paddle.maximum(boxes1[:, :, :2], boxes2[:, :2]) + rb = paddle.minimum(boxes1[:, :, 2:], boxes2[:, 2:]) + + wh = (rb - lt).clip(min=0) + inter = wh[:, :, 0] * wh[:, :, 1] + + union = area1.unsqueeze(1) + area2 - inter # broadcast + + iou = inter / union + return iou, union + + +def generalized_box_iou(boxes1, boxes2): + """Compute GIoU of each pais in boxes1 and boxes2 + + GIoU = IoU - |A_c - U| / |A_c| + where A_c is the smallest convex hull that encloses both boxes, U is the union of boxes + Details illustrations can be found in https://giou.stanford.edu/ + + Args: + boxes1: paddle.Tensor, shape=N x 4, boxes are stored in (x1, y1, x2, y2) format + boxes2: paddle.Tensor, shape=N x 4, boxes are stored in (x1, y1, x2, y2) format + Return: + giou: giou ratios between each pair of boxes in boxes1 and boxes2 + """ + + iou, union = box_iou(boxes1, boxes2) + + boxes1 = boxes1.unsqueeze(1) # N x 1 x 4 + lt = paddle.minimum(boxes1[:, :, :2], boxes2[:, :2]) + rb = paddle.maximum(boxes1[:, :, 2:], boxes2[:, 2:]) + + wh = (rb - lt).clip(min=0) + area = wh[:, :, 0] * wh[:, :, 1] + + return iou - (area-union) / area + + +def masks_to_boxes(masks): + """convert masks to bboxes + + Args: + masks: paddle.Tensor, NxHxW + Return: + boxes: paddle.Tensor, Nx4 + """ + + if masks.numel() == 0: + return paddle.zeros((0, 4)) + h, w = masks.shape[-2:] + y = paddle.arange(0, h, dtype='float32') + x = paddle.arange(0, w, dtype='float32') + y, x = paddle.meshgrid(y, x) + + x_mask = (masks * x.unsqueeze(0)) + x_max = x_mask.flatten(1).max(-1)[0] + + #x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1) + x_min = paddle.where(masks == 0, paddle.ones_like(x_mask)*float(1e8), x_mask) + x_min = x_min.flatten(1).min(-1)[0] + + y_mask = (masks * y.unsqueeze(0)) + y_max = y_mask.flatten(1).max(-1)[0] + #y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] + y_min = paddle.where(masks == 0, paddle.ones_like(y_mask) * float(1e8), y_mask) + y_min = y_min.flatten(1).min(-1)[0] + + return paddle.stack([x_min, y_min, x_max, y_max], 1) + + diff --git a/object_detection/DETR/coco.py b/object_detection/DETR/coco.py new file mode 100644 index 00000000..90a3ff54 --- /dev/null +++ b/object_detection/DETR/coco.py @@ -0,0 +1,268 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
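A short usage sketch of the box helpers defined above (illustrative only; it assumes `box_ops.py` in this folder is importable, which is how `detr.py` below imports it):

```python
import paddle

# Illustrative only: assumes box_ops.py (above) is on the import path.
from box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh, generalized_box_iou

# two normalized center-size boxes, the format produced by the DETR bbox head
cxcywh = paddle.to_tensor([[0.5, 0.5, 0.2, 0.2],
                           [0.6, 0.5, 0.2, 0.2]])

xyxy = box_cxcywh_to_xyxy(cxcywh)  # [[0.4, 0.4, 0.6, 0.6], [0.5, 0.4, 0.7, 0.6]]
print(paddle.allclose(box_xyxy_to_cxcywh(xyxy), cxcywh))  # round trip, expect True

# pairwise GIoU matrix between the two boxes and themselves
giou = generalized_box_iou(xyxy, xyxy)
print(giou.shape)    # [2, 2]; identical boxes give 1.0 on the diagonal
print(giou.numpy())
```

DETR predicts boxes in normalized cxcywh format, so these conversions show up both in the GIoU term of the loss and in the post-processing that maps predictions back to absolute xyxy coordinates.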
+ +""" +Dataset(COCO2017) related classes and methods for DETR training and validation +""" + +import os +import numpy as np +from PIL import Image +import paddle +from pycocotools.coco import COCO +from pycocotools import mask as coco_mask +import transforms as T +from utils import collate_fn + + +class CocoDetection(paddle.io.Dataset): + """ COCO Detection dataset + + This class gets images and annotations for paddle training and validation. + Transform(preprocessing) can be applied in __getitem__ method. + + Attributes: + img_folder: str, file path of coco images, e.g.{COCO_PATH}/train2017 + anno_file: str, path where annotation json file is stored + transforms: transforms applied on data, see make_coco_transform for details + return_masks: bool, if true, return coco masks, default: False (now only support False) + """ + + def __init__(self, img_folder, anno_file, transforms, return_masks): + super(CocoDetection, self).__init__() + self.coco = COCO(anno_file) + # coco all image ids + ids = list(sorted(self.coco.imgs.keys())) + # remove ids where anno has no bboxes + self.ids = self._remove_images_without_annotations(ids) + self._transforms = transforms + # Prepare has __call__ method takes image and target as inputs + # and applies label filtering and output image and label as paddle tensors + self.prepare = ConvertCocoPolysToMasks(return_masks) + self.root = img_folder + + def _remove_images_without_annotations(self, ids): + new_ids = [] + rm_cnt = 0 + for idx in ids: + annos = self._load_target(idx) + boxes = [] + for anno in annos: + if 'bbox' in anno: + boxes.append(anno['bbox']) + if len(boxes) == 0: + rm_cnt += 1 + continue + new_ids.append(idx) + print(f'loading coco data, {rm_cnt} imgs without annos are removed') + return new_ids + + def _load_image(self, idx): + """ Return PIL Image (RGB) according to COCO image id""" + path = self.coco.loadImgs(idx)[0]['file_name'] + return Image.open(os.path.join(self.root, path)).convert('RGB') + + def _load_target(self, idx): + """ Return image annos according to COCO image id""" + return self.coco.loadAnns(self.coco.getAnnIds(idx)) + + def __len__(self): + return len(self.ids) + + def __getitem__(self, idx): + """idx is for training image id, not COCO image id""" + image_id = self.ids[idx] + image = self._load_image(image_id) + target = self._load_target(image_id) + target = {'image_id': image_id, 'annotations': target} + + image, target = self.prepare(image, target) + if self._transforms is not None: + image, target = self._transforms(image, target) + return image, target + + +def convert_coco_poly_to_mask(segmentations, height, width): + """ Convert coco anno from polygons to image masks""" + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + # paddle any only support bool type + mask = paddle.to_tensor(mask, dtype='bool') # w x h x 1 + mask = mask.any(axis=2).squeeze(-1) # w x h + # paddle stack does not support bool type + mask = mask.astype('int32') + masks.append(mask) + if masks: + masks = paddle.stack(masks, axis=0) + else: + mask = paddle.zeros((0, height, width), dtype='int32') + return masks + + +class ConvertCocoPolysToMasks(): + """ Prepare coco annotations to paddle tensors""" + def __init__(self, return_masks=False): + self.return_masks = return_masks + + def __call__(self, image, target): + w, h = image.size + image_id = target['image_id'] + image_id = paddle.to_tensor([image_id]) + + anno = 
target['annotations'] + anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] + + boxes = [obj['bbox'] for obj in anno] + # Temp Fix: do it in numpy to skip paddl cuda error + boxes = np.array(boxes) + boxes = boxes.reshape([-1, 4]) + boxes[:, 2:] += boxes[:, :2] # (n, (x1, y1, x2, y2)) + + boxes = paddle.to_tensor(boxes, dtype='float32') + # paddle indexing may cause cuda errors + #boxes = boxes.reshape([-1, 4]) # (n, (x1, y1, box_w, box_h)) + #boxes[:, 2:] += boxes[:, :2] # (n, (x1, y1, x2, y2)) + + boxes[:, 0::2].clip_(min=0, max=w) # clip bbox inside image + boxes[:, 1::2].clip_(min=0, max=h) # clip bbox inside image + + classes = [obj['category_id'] for obj in anno] + classes = paddle.to_tensor(classes, dtype='float32') + + if self.return_masks: + segmentations = [obj['segmentation'] for obj in anno] + masks = convert_coco_poly_to_mask(segmentations, h, w) # [N, H, W] int32 tensor + + keypoints = None + if anno and 'keypoints' in anno[0]: + keypoints = [obj['keypoints'] for obj in anno] + keypoints = paddle.to_tensor(keypoints, dtype='float32') + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.reshape_((num_keypoints, -1, 3)) + + #TODO: should be replaced with paddle buildin logical ops in the future + boxes_tmp = boxes.cpu().numpy() + keep = (boxes_tmp[:, 3] > boxes_tmp[:, 1]) & (boxes_tmp[:, 2] > boxes_tmp[:, 0]) + keep_idx = np.where(keep)[0].astype('int32') + keep = paddle.to_tensor(keep_idx) + + boxes = boxes.index_select(keep, axis=0) + classes = classes.index_select(keep, axis=0) + if self.return_masks: + masks = masks.index_select(keep, axis=0) + if keypoints is not None: + keypoints = keypoints.index_select(keep, axis=0) + + target = {} + target['boxes'] = boxes + target['labels'] = classes + if self.return_masks: + target['masks'] = masks + if keypoints is not None: + target['keypoints'] = keypoints + target['image_id'] = image_id + + area = paddle.to_tensor([obj['area'] for obj in anno]) + iscrowd = paddle.to_tensor([obj['iscrowd'] if 'iscrowd' in obj else 0 for obj in anno]) + target['area'] = area + target['iscrowd'] = iscrowd.index_select(keep, axis=0) + + target['orig_size'] = paddle.to_tensor([int(h), int(w)]) + target['size'] = paddle.to_tensor([int(h), int(w)]) + + return image, target + + +def make_coco_transforms(image_set): + """ return transforms(class defined in ./transforms.py) for coco train and val""" + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.RandomSelect( + T.RandomResize(scales, max_size=1333), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=1333), + ]) + ), + normalize, + ]) + + if image_set == 'val': + return T.Compose([ + T.RandomResize([800], max_size=1333), + normalize, + ]) + + raise ValueError(f'Unknown {image_set}') + + +def build_coco(image_set, coco_path, masks=False): + """Return CocoDetection dataset according to image_set: ['train', 'val']""" + assert image_set in ['train', 'val'], f'image_set {image_set} not supported' + assert os.path.exists(coco_path), f'provided COCO path {coco_path} does not exist' + mode = 'instances' + paths = { + 'train': (os.path.join(coco_path, 'train2017'), + os.path.join(coco_path, 'annotations', f'{mode}_train2017.json')), + 'val': (os.path.join(coco_path, 'val2017'), + 
os.path.join(coco_path, 'annotations', f'{mode}_val2017.json')), + } + img_folder, anno_file = paths[image_set] + dataset = CocoDetection(img_folder, + anno_file, + transforms=make_coco_transforms(image_set), + return_masks=masks) + return dataset + + +def get_dataloader(dataset, batch_size, mode='train', multi_gpu=False): + """ return dataloader on train/val set for single/multi gpu + Arguments: + dataset: paddle.io.Dataset, coco dataset + batch_size: int, num of samples in one batch + mode: str, ['train', 'val'], dataset to use + multi_gpu: bool, if True, DistributedBatchSampler is used for DDP + """ + if multi_gpu: + sampler = paddle.io.DistributedBatchSampler( + dataset, + batch_size=batch_size, + shuffle=(mode == 'train'), + drop_last=True) + #TODO: may need to fix this drop_last of multi-gpu dataloading error + # currently, val may drop several samples, which will lower the performance + # an idea is to pad the last batch in collate_fn + dataloader = paddle.io.DataLoader(dataset, + batch_sampler=sampler, + collate_fn=collate_fn) + else: + dataloader = paddle.io.DataLoader(dataset, + batch_size=batch_size, + shuffle=(mode == 'train'), + collate_fn=collate_fn) + return dataloader diff --git a/object_detection/DETR/coco_eval.py b/object_detection/DETR/coco_eval.py new file mode 100644 index 00000000..e108d8ec --- /dev/null +++ b/object_detection/DETR/coco_eval.py @@ -0,0 +1,242 @@ +import os +import contextlib +import copy +import numpy as np +import paddle + +from pycocotools.cocoeval import COCOeval +from pycocotools.coco import COCO +import pycocotools.mask as mask_util + +from utils import all_gather + +class CocoEvaluator(): + def __init__(self, coco_gt, iou_types): + assert isinstance(iou_types, (list, tuple)) + coco_gt = copy.deepcopy(coco_gt) + self.coco_gt = coco_gt + self.iou_types = iou_types + self.coco_eval = {} + for iou_type in iou_types: + self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) + self.img_ids = [] + self.eval_imgs = {k: [] for k in iou_types} + + def update(self, predictions): + img_ids = list(np.unique(list(predictions.keys()))) + self.img_ids.extend(img_ids) + + for iou_type in self.iou_types: + results = self.prepare(predictions, iou_type) + + with open(os.devnull, 'w') as devnull: + with contextlib.redirect_stdout(devnull): + coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO() + coco_eval = self.coco_eval[iou_type] + + coco_eval.cocoDt = coco_dt + coco_eval.params.imgIds = list(img_ids) + img_ids, eval_imgs = evaluate(coco_eval) + #print('eval_imgs shape: ', eval_imgs.shape) + + self.eval_imgs[iou_type].append(eval_imgs) + + def synchronize_between_processes(self): + for iou_type in self.iou_types: + self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) + create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) + + def accumulate(self): + for coco_eval in self.coco_eval.values(): + coco_eval.accumulate() + + def summarize(self): + for iou_type, coco_eval in self.coco_eval.items(): + print(f'IoU metric: {iou_type}') + coco_eval.summarize() + + def prepare(self, predictions, iou_type): + if iou_type == 'bbox': + return self.prepare_for_coco_detection(predictions) + elif iou_type == 'segm': + return self.prepare_for_coco_segmentation(predictions) + elif iou_type == 'keypoints': + return self.prepare_for_coco_keypoint(predictions) + else: + raise ValueError(f'Unknown iou type {iou_type}') + + def prepare_for_coco_detection(self, predictions): + coco_results = [] + for original_id, 
prediction in predictions.items():
+            if len(prediction) == 0:
+                continue
+            boxes = prediction['boxes']
+            boxes = convert_to_xywh(boxes).tolist()
+            scores = prediction['scores'].tolist()
+            labels = prediction['labels'].tolist()
+
+            coco_results.extend(
+                [
+                    {
+                        'image_id': original_id,
+                        'category_id': labels[k],
+                        'bbox': box,
+                        'score': scores[k],
+                    }
+                    for k, box in enumerate(boxes)
+                ]
+            )
+        return coco_results
+
+    def prepare_for_coco_segmentation(self, predictions):
+        coco_results = []
+        for original_id, prediction in predictions.items():
+            if len(prediction) == 0:
+                continue
+            scores = prediction['scores'].tolist()
+            labels = prediction['labels'].tolist()
+            masks = prediction['masks']
+            masks = masks > 0.5
+
+            rles = [
+                mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order='F'))[0]
+                for mask in masks
+            ]
+            for rle in rles:
+                rle['counts'] = rle['counts'].decode('utf-8')
+
+            coco_results.extend(
+                [
+                    {
+                        'image_id': original_id,
+                        'category_id': labels[k],
+                        'segmentation': rle,
+                        'score': scores[k],
+                    }
+                    for k, rle in enumerate(rles)
+                ]
+            )
+        return coco_results
+
+
+    def prepare_for_coco_keypoint(self, predictions):
+        coco_results = []
+        for original_id, prediction in predictions.items():
+            if len(prediction) == 0:
+                continue
+            boxes = prediction['boxes']
+            boxes = convert_to_xywh(boxes).tolist()
+            scores = prediction['scores'].tolist()
+            labels = prediction['labels'].tolist()
+            keypoints = prediction['keypoints']
+            keypoints = keypoints.flatten(start_axis=1).tolist()
+
+            coco_results.extend(
+                [
+                    {
+                        'image_id': original_id,
+                        'category_id': labels[k],
+                        'keypoints': keypoint,
+                        'score': scores[k],
+                    }
+                    for k, keypoint in enumerate(keypoints)
+                ]
+            )
+        return coco_results
+
+
+def convert_to_xywh(boxes):
+    xmin, ymin, xmax, ymax = boxes.unbind(1)
+    return paddle.stack((xmin, ymin, xmax - xmin, ymax - ymin), axis=1)
+
+
+def merge(img_ids, eval_imgs):
+    #all_img_ids = [img_ids]
+    #all_eval_imgs = [eval_imgs]
+    all_img_ids = all_gather(img_ids)
+    all_eval_imgs = all_gather(eval_imgs)
+
+    merged_img_ids = []
+    for p in all_img_ids:
+        merged_img_ids.extend(p)
+
+    merged_eval_imgs = []
+    for p in all_eval_imgs:
+        merged_eval_imgs.append(p)
+
+    merged_img_ids = np.array(merged_img_ids)
+    merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)
+
+    merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
+    merged_eval_imgs = merged_eval_imgs[..., idx]
+
+    return merged_img_ids, merged_eval_imgs
+
+
+def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
+    img_ids, eval_imgs = merge(img_ids, eval_imgs)
+    img_ids = list(img_ids)
+    eval_imgs = list(eval_imgs.flatten())
+
+    coco_eval.evalImgs = eval_imgs
+    coco_eval.params.imgIds = img_ids
+    coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
+
+
+#################################################################
+# From pycocotools, just removed the prints and fixed
+# a Python3 bug about unicode not defined
+#################################################################
+
+
+def evaluate(self):
+    '''
+    Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
+    :return: None
+    '''
+    # tic = time.time()
+    # print('Running per image evaluation...')
+    p = self.params
+    # add backward compatibility if useSegm is specified in params
+    if p.useSegm is not None:
+        p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
+        print('useSegm (deprecated) is not None. 
Running {} evaluation'.format(p.iouType)) + # print('Evaluate annotation type *{}*'.format(p.iouType)) + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params = p + + + self._prepare() + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + if p.iouType == 'segm' or p.iouType == 'bbox': + computeIoU = self.computeIoU + elif p.iouType == 'keypoints': + computeIoU = self.computeOks + self.ious = { + (imgId, catId): computeIoU(imgId, catId) + for imgId in p.imgIds + for catId in catIds} + + evaluateImg = self.evaluateImg + maxDet = p.maxDets[-1] + evalImgs = [ + evaluateImg(imgId, catId, areaRng, maxDet) + for catId in catIds + for areaRng in p.areaRng + for imgId in p.imgIds + ] + # this is NOT in the pycocotools code, but could be done outside + evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds)) + self._paramsEval = copy.deepcopy(self.params) + # toc = time.time() + # print('DONE (t={:0.2f}s).'.format(toc-tic)) + return p.imgIds, evalImgs + +################################################################# +# end of straight copy from pycocotools, just removing the prints +################################################################# diff --git a/object_detection/DETR/config.py b/object_detection/DETR/config.py new file mode 100644 index 00000000..6a8f2979 --- /dev/null +++ b/object_detection/DETR/config.py @@ -0,0 +1,134 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration + +Configuration for data, model archtecture, and training, etc. 
+Config can be set by .yaml file or by argparser(limited usage) + + +""" +import os +from yacs.config import CfgNode as CN +import yaml + +_C = CN() +_C.BASE = [''] + +# data settings +_C.DATA = CN() +_C.DATA.BATCH_SIZE = 256 #256 # train batch_size for single GPU +_C.DATA.BATCH_SIZE_EVAL = 8 #64 # val batch_size for single GPU +_C.DATA.DATA_PATH = '/dataset/coco/' # path to dataset +_C.DATA.DATASET = 'coco' # dataset name +_C.DATA.NUM_WORKERS = 1 # number of data loading threads + +# model settings +_C.MODEL = CN() +_C.MODEL.TYPE = 'DETR' +_C.MODEL.NAME = 'DETR' +_C.MODEL.RESUME = None +_C.MODEL.PRETRAINED = None +_C.MODEL.NUM_CLASSES = 91 +_C.MODEL.DROPOUT = 0.1 +_C.MODEL.ATTENTION_DROPOUT = 0.0 +_C.MODEL.NUM_QUERIES = 100 + +_C.MODEL.BACKBONE = 'resnet50' + +# transformer settings +_C.MODEL.TRANS = CN() +_C.MODEL.TRANS.HIDDEN_SIZE = 256 +_C.MODEL.TRANS.MLP_DIM = 2048 +_C.MODEL.TRANS.NUM_HEADS = 8 +_C.MODEL.TRANS.NUM_ENCODER_LAYERS = 6 +_C.MODEL.TRANS.NUM_DECODER_LAYERS = 6 +_C.MODEL.TRANS.QKV_BIAS = True + +# training settings +_C.TRAIN = CN() +_C.TRAIN.LAST_EPOCH = 0 +_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 3 #34 # ~ 10k steps for 4096 batch size +_C.TRAIN.WEIGHT_DECAY = 0.01 #0.3 # 0.0 for finetune +_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune +_C.TRAIN.WARMUP_START_LR = 1e-6 #0.0 +_C.TRAIN.END_LR = 1e-5 +_C.TRAIN.GRAD_CLIP = 1.0 +_C.TRAIN.ACCUM_ITER = 2 #1 + +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' +_C.TRAIN.LR_SCHEDULER.MILESTONES = "30, 60, 90" # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 # only used in StepLRScheduler + +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'AdamW' +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for adamW +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# misc +_C.SAVE = "./output" +_C.TAG = "default" +_C.SAVE_FREQ = 20 # freq to save chpt +_C.REPORT_FREQ = 50 # freq to logging info +_C.VALIDATE_FREQ = 20 # freq to do validation +_C.SEED = 0 +_C.EVAL = False # run evaluation only +_C.LOCAL_RANK = 0 +_C.NGPUS = -1 + + +def _update_config_from_file(config, cfg_file): + config.defrost() + with open(cfg_file, 'r') as infile: + yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader) + for cfg in yaml_cfg.setdefault('BASE', ['']): + if cfg: + _update_config_from_file( + config, os.path.join(os.path.dirname(cfg_file), cfg) + ) + print('merging config from {}'.format(cfg_file)) + config.merge_from_file(cfg_file) + config.freeze() + +def update_config(config, args): + if args.cfg: + _update_config_from_file(config, args.cfg) + config.defrost() + if args.dataset: + config.DATA.DATASET = args.dataset + if args.batch_size: + config.DATA.BATCH_SIZE = args.batch_size + if args.data_path: + config.DATA.DATA_PATH = args.data_path + if args.eval: + config.EVAL = True + config.DATA.BATCH_SIZE_EVAL = args.batch_size + if args.pretrained: + config.MODEL.PRETRAINED = args.pretrained + if args.backbone: + config.MODEL.BACKBONE = args.backbone + + #config.freeze() + return config + + +def get_config(): + config = _C.clone() + return config + + diff --git a/object_detection/DETR/configs/detr_resnet50.yaml b/object_detection/DETR/configs/detr_resnet50.yaml new file mode 100644 index 00000000..bdcb017c --- /dev/null +++ b/object_detection/DETR/configs/detr_resnet50.yaml @@ -0,0 +1,4 @@ +DATA: + BATCH_SIZE: 8 + + diff --git a/object_detection/DETR/datasets.py b/object_detection/DETR/datasets.py new 
file mode 100644 index 00000000..6a979516 --- /dev/null +++ b/object_detection/DETR/datasets.py @@ -0,0 +1,148 @@ +import os +import paddle +from paddle.io import Dataset, DataLoader +from paddle.vision import transforms, datasets, image_load, set_image_backend +import numpy as np +import argparse +from PIL import Image +import cv2 +from config import * + +class ImageNet1MDataset(Dataset): + def __init__(self, file_folder, mode="train", transform=None): + super(ImageNet1MDataset, self).__init__() + assert mode in ["train", "val"] + self.file_folder = file_folder + self.transform = transform + self.img_path_list = [] + self.label_list = [] + + if mode=="train": + self.list_file = os.path.join(self.file_folder, "train_list.txt") + else: + self.list_file = os.path.join(self.file_folder, "val_list.txt") + + with open(self.list_file, 'r') as infile: + for line in infile: + img_path = line.strip().split()[0] + img_label = int(line.strip().split()[1]) + self.img_path_list.append(os.path.join(self.file_folder,img_path)) + self.label_list.append(img_label) + print(len(self.label_list)) + + def __len__(self): + return len(self.label_list) + + def __getitem__(self, index): + #print(self.img_path_list[index]) + #if os.path.isfile(self.img_path_list[index]): + # print('exist') + #else: + # print('not exist') + #data = Image.open(self.img_path_list[index]).convert('L') + #data = cv2.imread(self.img_path_list[index]) + set_image_backend('cv2') + data = image_load(self.img_path_list[index]) + data = self.transform(data) + label = self.label_list[index] + + return data, label + + +def get_dataset(config): + transform_train = transforms.Compose([ + transforms.RandomResizedCrop((config.IMAGE_SIZE, config.IMAGE_SIZE), scale=(0.05, 1.0)), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + ]) + + transform_test = transforms.Compose([ + transforms.Resize((config.IMAGE_SIZE, config.IMAGE_SIZE)), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + ]) + + if config.DATASET == "cifar10": + dataset_train = datasets.Cifar10(mode="train", transform=transform_train) + dataset_test = datasets.Cifar10(mode="test", transform=transform_test) + elif config.DATASET == "cifar100": + dataset_train = datasets.Cifar100(mode="train", transform=transform_train) + dataset_test = datasets.Cifar100(mode="test", transform=transform_test) + elif config.DATASET == "imagenet1m": + dataset_train = ImageNet1MDataset(config.DATA_PATH, mode="train", transform=transform_train) + dataset_test = ImageNet1MDataset(config.DATA_PATH, mode="val", transform=transform_test) + else: + raise NotImplementedError("Only cifar10, cifar100, imagenet1m are supported now") + + return dataset_train, dataset_test + + +def get_loader(config, dataset_train, dataset_test=None, multi=False): + # multigpu + if multi: + sampler_train = paddle.io.DistributedBatchSampler(dataset_train, + batch_size=config.BATCH_SIZE, + shuffle=True, + ) + dataloader_train = DataLoader(dataset_train, batch_sampler=sampler_train) + if dataset_test is not None: + sampler_test = paddle.io.DistributedBatchSampler(dataset_test, + batch_size=config.BATCH_SIZE, + shuffle=False, + ) + dataloader_test = DataLoader(dataset_test, batch_sampler=sampler_test) + else: + dataloader_test = None + else: + # single gpu + dataloader_train = DataLoader(dataset_train, + batch_size=config.BATCH_SIZE, + num_workers=config.NUM_WORKERS, + shuffle=True, + #places=paddle.CUDAPlace(0), + ) + + if dataset_test is not None: + 
dataloader_test = DataLoader(dataset_test, + batch_size=config.BATCH_SIZE, + num_workers=config.NUM_WORKERS, + shuffle=False, + #places=paddle.CUDAPlace(0), + ) + else: + dataloader_test = None + + + return dataloader_train, dataloader_test + + + + +def main(): + print('dataset and dataloader') + parser = argparse.ArgumentParser('') + parser.add_argument('-cfg', type=str, default=None) + parser.add_argument('-dataset', type=str, default="imagenet1m") + parser.add_argument('-batch_size', type=int, default=256) + parser.add_argument('-image_size', type=int, default=224) + parser.add_argument('-data_path', type=str, default='/dataset/imagenet/') + args = parser.parse_args() + print(args) + + config = get_config() + config = update_config(config, args) + print(config) + + dt_trn, dt_tst = get_dataset(config.DATA) + dl_trn, dl_tst = get_loader(config.DATA, dt_trn, dt_tst) + + for idx, (batch_data, batch_label) in enumerate(dl_tst): + print(batch_data.shape) + print(batch_label) + if idx == 10: + break + + + +if __name__ == "__main__": + main() diff --git a/object_detection/DETR/detr.py b/object_detection/DETR/detr.py new file mode 100644 index 00000000..88d16c4c --- /dev/null +++ b/object_detection/DETR/detr.py @@ -0,0 +1,379 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +DETR related classes and methods +""" + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as dist +from transformer import build_transformer +from backbone import build_backbone +from matcher import build_matcher +from utils import nested_tensor_from_tensor_list +from segmentation import dice_loss, sigmoid_focal_loss +from box_ops import generalized_box_iou +from box_ops import box_cxcywh_to_xyxy +from box_ops import box_xyxy_to_cxcywh + + +def build_detr(config): + """ build detr model from configs""" + # 1. build backbone with position embedding + backbone = build_backbone(config) + # 2. build transformer (encoders and decoders) + transformer = build_transformer(config) + # 3. build DETR model + aux_loss = not config.EVAL # True if training + detr = DETR(backbone, transformer, config.MODEL.NUM_CLASSES, config.MODEL.NUM_QUERIES, aux_loss) + # 4. build matcher + matcher = build_matcher() + # 5. setup aux_loss + weight_dict = {'loss_ce': 1, 'loss_bbox': 5, 'loss_giou': 2} + if aux_loss: + aux_weight_dict = {} + for i in range(config.MODEL.TRANS.NUM_DECODER_LAYERS-1): + aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + + losses = ['labels', 'boxes', 'cardinality'] + # 6. build criterion + criterion = SetCriterion(config.MODEL.NUM_CLASSES, + matcher=matcher, + weight_dict=weight_dict, + eos_coef=0.1, + losses=losses) + # 7. 
build postprocessors + postprocessors = {'bbox': PostProcess()} + + return detr, criterion, postprocessors + + +class DETR(nn.Layer): + def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=False): + """ Initialize the model + Args: + backbone: nn.Layer, paddle module of the backbone. + transformer: nn.Layer, paddle module of the transformer. + num_classes: int, number of object classes + num_queries: int, number of object queries, this is the max number + of objects the DETR can detect in a single image. + aux_loss: bool, True if auxiliary decoding losses(loss at each decodr layer) are used + """ + + super(DETR, self).__init__() + self.num_queries = num_queries + self.transformer = transformer + hidden_dim = transformer.d_model + #print('hidden_dim', hidden_dim) + w_attr_1, b_attr_1 = self._init_weights() + self.class_embed = nn.Linear(hidden_dim, + num_classes+1, + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + self.bbox_embed = MLP(in_dim=hidden_dim, + hidden_dim=hidden_dim, + out_dim=4, + num_layers=3) # different from transformer Mlp + + w_attr_2, _ = self._init_weights() + self.query_embed = nn.Embedding(num_queries, + hidden_dim, + weight_attr=w_attr_2) + # proj features from resnet to hidden_dim channels + self.input_proj = nn.Conv2D(backbone.num_channels, hidden_dim, kernel_size=1) + self.backbone = backbone + self.aux_loss = aux_loss + + def _init_weights(self): + w_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + b_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + return w_attr, b_attr + + def forward(self, x): + features, pos = self.backbone(x) #resnet + position_embed + # decompose NestedTensor to separate 'tensor' and 'mask' tensors + src, mask = features[-1].decompose() # last output layer feature + #print('backbone features: ') + #print(src, src.shape) + #print(mask, mask.shape) + #print('|||||||||||||||||||||||||||||||||||') + #print(self.query_embed.weight, self.query_embed.weight.shape) + #print(pos[-1], pos[-1].shape) + #print(self.input_proj(src)) + src = self.input_proj(src) # proj feature channel to hidden_dim + hs = self.transformer(src, mask, self.query_embed.weight, pos[-1])[0] + #print('||||||||||||||HS |||||||||||||||||||||') + #print(hs) + + output_class = self.class_embed(hs) + output_coord = F.sigmoid(self.bbox_embed(hs)) + #print('output_class',output_class.shape) + #print('output_coord',output_coord.shape) + out = {'pred_logits': output_class[-1], + 'pred_boxes': output_coord[-1]} + if self.aux_loss: + out['aux_outputs'] = self._set_aux_loss(output_class, output_coord) + + #print("===================== output['pred_logits']============") + #print(out['pred_logits']) + #print("===================== output['pred_boxes']============") + #print(out['pred_boxes']) + #print("===================== output['aux_outputs']============") + #print(out['aux_outputs']) + #print(out) + # out: {'pred_logits': [batch, num_queries, num_classes], + # 'pred_boxes': [batch, num_queries, pos_embed_dim], + # 'aux_outputs': outputs {pred_logits:[], pred_boxes:[] }for each dec} + return out + + def _set_aux_loss(self, output_class, output_coord): + return [{'pred_logits': a, 'pred_boxes': b} for a, b in zip(output_class[:-1], output_coord[:-1])] + + +class SetCriterion(nn.Layer): + """ build criterions for DETR""" + def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses): + super().__init__() + self.num_classes = num_classes + self.matcher = matcher + self.weight_dict = weight_dict + self.eos_coef = 
eos_coef + self.losses = losses # [str, str, str] + empty_w = paddle.ones([self.num_classes + 1]) + empty_w[-1] = self.eos_coef + self.register_buffer(name='empty_weight', tensor=empty_w) + + def loss_labels(self, outputs, targets, indices, num_boxes): + """Classification loss (NLL) + targets dicts must contain 'labels', a tensor of dim [num_target_boxes] + """ + assert 'pred_logits' in outputs + src_logits = outputs['pred_logits'] + idx = self._get_src_permutation_idx(indices) + #print('--loss_labels') + #print(indices) + #print(targets[0]['labels'].index_select(indices[0][1])) + #target_classes_o = paddle.concat([t['labels'][J] for t, (_, J) in zip(targets, indices)]) + target_classes_o = paddle.concat([t['labels'].index_select(J) for t, (_, J) in zip(targets, indices)]) + target_classes = paddle.full(src_logits.shape[:2], self.num_classes, dtype='int64') + target_classes[idx] = target_classes_o + + #print('target_classes= ', target_classes.shape) + #print('src_logits = ', src_logits.shape) +# paddle cross_entropy input is [N1, N2, ... C], last C is num_classes, thus no transpose is needed + #print('--------------------------') + #print('--------------------------') + #print('--------------------------') + #print('src_logits: ', src_logits) + #print('target_classes: ', target_classes) + #print('self.empty_weight: ', self.empty_weight) + loss_ce = F.cross_entropy(input=src_logits, label=target_classes, weight=self.empty_weight) + #loss_ce = F.cross_entropy(src_logits.transpose([0, 2, 1]), target_classes, self.empty_weight) + losses = {'loss_ce': loss_ce} + + return losses + + @paddle.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ Compute the absolute error in the num of predicted non-empty boxes + This is not a real loss, for logging only, so no grad is set. + """ + pred_logits = outputs['pred_logits'] + tgt_lengths = paddle.to_tensor([len(v['labels']) for v in targets]) + + card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1) + card_pred = card_pred.astype('float32').sum(1) + card_err = F.l1_loss(card_pred.astype('float32'), tgt_lengths.astype('float32')) + losses = {'cardinality_error': card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """ Compute bbox loss, the L1 loss and GIoU loss. 
+
+        Targets must contain the 'boxes' key, holding a tensor of dim [num_target_boxes, 4]
+        Target boxes are in format cxcywh, normalized by image size
+        """
+        assert 'pred_boxes' in outputs
+        #print('-----loss_boxes')
+        idx = self._get_src_permutation_idx(indices)
+        # idx is a tuple, len(idx) == 2, idx=(batch_idx, src_idx)
+        #print('idx=', idx)
+
+        src_boxes = paddle.zeros([idx[0].shape[0], 4])
+        for i in range(idx[0].shape[0]):
+            src_boxes[i, :] = outputs['pred_boxes'][idx[0][i], idx[1][i], :]
+
+        #print('src_boxes', src_boxes)
+        #src_boxes = outputs['pred_boxes'].index_select(idx)
+        target_boxes = paddle.concat([t['boxes'].index_select(i) for t, (_, i) in zip(targets, indices)])
+
+        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')
+        losses = {}
+        losses['loss_bbox'] = loss_bbox.sum() / num_boxes
+
+        loss_giou = 1 - paddle.diag(generalized_box_iou(
+            box_cxcywh_to_xyxy(src_boxes),
+            box_cxcywh_to_xyxy(target_boxes)))
+        losses['loss_giou'] = loss_giou.sum() / num_boxes
+        return losses
+
+
+    def loss_masks(self, outputs, targets, indices, num_boxes):
+        """ Compute the mask loss, the focal loss and the dice loss
+            Targets must have 'masks' key, a tensor of dim [num_target_boxes, h, w]
+        """
+        src_idx = self._get_src_permutation_idx(indices)
+        tgt_idx = self._get_tgt_permutation_idx(indices)
+        src_masks = outputs['pred_masks']
+        src_masks = src_masks[src_idx]
+        masks = [t['masks'] for t in targets]
+
+        target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
+        target_masks = target_masks.astype(src_masks.dtype)
+        target_masks = target_masks[tgt_idx]
+
+        # upsample predictions to the target mask size
+        src_masks = F.interpolate(src_masks.unsqueeze(1), size=target_masks.shape[-2:],
+                                  mode='bilinear', align_corners=False)
+        src_masks = src_masks[:, 0].flatten(1)
+
+        target_masks = target_masks.flatten(1)
+        target_masks = target_masks.reshape(src_masks.shape)
+        losses = {
+            'loss_mask': sigmoid_focal_loss(src_masks, target_masks, num_boxes),
+            'loss_dice': dice_loss(src_masks, target_masks, num_boxes)
+        }
+        return losses
+
+    def _get_src_permutation_idx(self, indices):
+        #print('indices------')
+        #print(indices)
+        batch_idx = paddle.concat([paddle.full_like(src, i) for i, (src, _) in enumerate(indices)])
+        src_idx = paddle.concat([src for (src, _) in indices])
+        return batch_idx, src_idx
+
+    def _get_tgt_permutation_idx(self, indices):
+        batch_idx = paddle.concat([paddle.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
+        tgt_idx = paddle.concat([tgt for (_, tgt) in indices])
+        return batch_idx, tgt_idx
+
+    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
+        loss_map = {
+            'labels': self.loss_labels,
+            'cardinality': self.loss_cardinality,
+            'boxes': self.loss_boxes,
+            'masks': self.loss_masks
+        }
+        assert loss in loss_map, f"loss {loss} not in loss_map"
+        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)
+
+    def forward(self, outputs, targets):
+        outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'}
+        indices = self.matcher(outputs_without_aux, targets) # list of index(tensor) pairs
+        #print('----------------------- indices ----------------')
+        #print(indices)
+
+        num_boxes = sum(len(t['labels']) for t in targets)
+        num_boxes = paddle.to_tensor([num_boxes], dtype='float32')
+        # TODO: all_reduce num_boxes if dist is used
+        #dist.all_reduce(num_boxes)
+        num_boxes = paddle.clip(num_boxes / dist.get_world_size(), min=1).item()
+        #print('num_boxes = ', num_boxes)
+
+        losses = {}
+        for loss in self.losses:
+            losses.update(self.get_loss(loss, outputs, targets, 
indices, num_boxes)) + + if 'aux_outputs' in outputs: + for i, aux_outputs in enumerate(outputs['aux_outputs']): + indices = self.matcher(aux_outputs, targets) + #print('aux indices = ', indices) + for loss in self.losses: + if loss == "masks": + # Intermediate masks losses are too costly to comput, ignore then + continue + kwargs = {} + l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs) + l_dict = {k + f'_{i}': v for k, v in l_dict.items()} + #print('l_dict', l_dict) + losses.update(l_dict) + + return losses + + +class PostProcess(nn.Layer): + """ This module converts the model's output into the format for coco api""" + @paddle.no_grad() + def forward(self, outputs, target_sizes): + out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes'] + prob = F.softmax(out_logits, -1) # [batch_size, num_queries, num_classes] + #scores, labels = prob[..., :-1].max(-1) + scores = prob[:, :, :-1].max(-1) + labels = prob[:, :, :-1].argmax(-1) + #print('pose process') + #print(scores) + #print(labels) + + # convert to [x0, y0, x1, y1] format + boxes = box_cxcywh_to_xyxy(out_bbox) + # from [0, 1] to absolute [0, height] coords + img_h, img_w = target_sizes.unbind(1) + scale_factor = paddle.stack([img_w, img_h, img_w, img_h], axis=1) + scale_factor = scale_factor.unsqueeze(1) + boxes = boxes * scale_factor + + results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)] + return results + + +class MLP(nn.Layer): + """ Build mlp layers + + Multiple linear layers with ReLU activation(except last linear) applied. + + Args: + in_dim: input feature dim for mlp + hidden_dim: input and output dims for middle linear layers + out_dim: output dim for last linear layer + num_layers: num of linear layers + """ + + def __init__(self, in_dim, hidden_dim, out_dim, num_layers): + super(MLP, self).__init__() + self.num_layers = num_layers + hidden_dims = [hidden_dim] * (num_layers -1) + layer_list = [] + for idim, odim in zip([in_dim] + hidden_dims, hidden_dims + [out_dim]): + w_attr, b_attr= self._init_weights() + layer_list.append( + nn.Linear(idim, odim, weight_attr=w_attr, bias_attr=b_attr)) + self.layers = nn.LayerList(layer_list) + self.relu = nn.ReLU() + + def _init_weights(self): + w_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + b_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + return w_attr, b_attr + + def forward(self, x): + for idx, layer in enumerate(self.layers): + x = layer(x) + # last layer no activation + if idx < len(self.layers) - 1: + x = self.relu(x) + return x + + diff --git a/object_detection/DETR/eval.py b/object_detection/DETR/eval.py new file mode 100644 index 00000000..1e38f914 --- /dev/null +++ b/object_detection/DETR/eval.py @@ -0,0 +1,36 @@ +import paddle +from coco import build_coco +from coco import get_loader +from detr import build_detr + +def main(): + model, criterion, postprocessors = build_detr() + model_state = paddle.load('./detr_resnet50.pdparams') + model.set_dict(model_state) + model.eval() + + # 2. 
Create val dataloader + dataset_val = build_coco('val', '/dataset/coco/') + dataloader_val = get_loader(dataset_val, + batch_size=4, + mode='val', + multi_gpu=False) + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader_val): + samples = data[0] + targets = data[1] + outputs = model(samples) + loss_dict = criterion(outputs, targets) + orig_target_sizes = paddle.stack([t['orig_size'] for t in targets], axis=0) + results = postprocessors['bbox'](outputs, orig_target_sizes) + res = {target['image_id']: output for target, output in zip(targets, results)} + + print(f'{batch_id}/{len(dataloader_val)} done') + + print('all done') + + + +if __name__ == '__main__': + main() diff --git a/object_detection/DETR/load_pytorch_weights.py b/object_detection/DETR/load_pytorch_weights.py new file mode 100644 index 00000000..3a09f017 --- /dev/null +++ b/object_detection/DETR/load_pytorch_weights.py @@ -0,0 +1,388 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +sys.path.append('/root/.cache/torch/hub/facebookresearch_detr_master/util/') + +from misc import NestedTensor as ThNestedTensor +import os +import argparse +import numpy as np +import paddle +import torch +#import timm +#from transformer import * +#from config import * +from detr import build_detr +from utils import NestedTensor + +import misc as th_utils +#config = get_config() +#parser = argparse.ArgumentParser('') +#parser.add_argument('-cfg', type=str, default='./configs/vit_large_patch16_224.yaml') +##parser.add_argument('-dataset', type=str, default="imagenet2012") +#parser.add_argument('-dataset', type=str, default="cifar10") +#parser.add_argument('-batch_size', type=int, default=4) +#parser.add_argument('-image_size', type=int, default=224) +#parser.add_argument('-data_path', type=str, default='/dataset/imagenet/') +#parser.add_argument('-eval', action="store_true") +#parser.add_argument('-pretrained', type=str, default=None) +#args = parser.parse_args() +# +#config = get_config() +#config = update_config(config, args) +#print(config) +# +# +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + + +def torch_to_paddle_mapping(): + map1 = torch_to_paddle_mapping_backbone() + map2 = torch_to_paddle_mapping_transformer() + map3 = torch_to_paddle_mapping_bn_from_buffer() + map1.extend(map2) + map1.extend(map3) + return map1 + +def torch_to_paddle_mapping_bn_from_buffer(): + mapping = [('backbone.0.body.bn1','backbone.0.body.bn1')] + + block_depth = [3, 4, 6, 3] + for block_idx in range(1,5): + th_block_prefix = f'backbone.0.body.layer{block_idx}' + pp_block_prefix = f'backbone.0.body.layer{block_idx}' + mapping.append((f'{th_block_prefix}.0.downsample.1', + f'{pp_block_prefix}.0.downsample.1')) + + for layer_idx in range(block_depth[block_idx-1]): + th_prefix = f'{th_block_prefix}.{layer_idx}' + 
pp_prefix = f'{pp_block_prefix}.{layer_idx}' + layer_mapping = [ + (f'{th_prefix}.bn1', f'{pp_prefix}.bn1'), + (f'{th_prefix}.bn2', f'{pp_prefix}.bn2'), + (f'{th_prefix}.bn3', f'{pp_prefix}.bn3'), + ] + mapping.extend(layer_mapping) + return mapping + +def torch_to_paddle_mapping_backbone(): + mapping = [('backbone.0.body.conv1','backbone.0.body.conv1')] + + block_depth = [3, 4, 6, 3] + for block_idx in range(1,5): + th_block_prefix = f'backbone.0.body.layer{block_idx}' + pp_block_prefix = f'backbone.0.body.layer{block_idx}' + mapping.append((f'{th_block_prefix}.0.downsample.0', + f'{pp_block_prefix}.0.downsample.0')) + + for layer_idx in range(block_depth[block_idx-1]): + th_prefix = f'{th_block_prefix}.{layer_idx}' + pp_prefix = f'{pp_block_prefix}.{layer_idx}' + layer_mapping = [ + (f'{th_prefix}.conv1', f'{pp_prefix}.conv1'), + (f'{th_prefix}.conv2', f'{pp_prefix}.conv2'), + (f'{th_prefix}.conv3', f'{pp_prefix}.conv3'), + ] + mapping.extend(layer_mapping) + return mapping + + +def torch_to_paddle_mapping_transformer(): + mapping = [ + ('class_embed', 'class_embed'), + ('query_embed', 'query_embed'), + ('input_proj', 'input_proj'), + ('bbox_embed.layers.0', 'bbox_embed.layers.0'), + ('bbox_embed.layers.1', 'bbox_embed.layers.1'), + ('bbox_embed.layers.2', 'bbox_embed.layers.2'), + ('transformer.decoder.norm', 'transformer.decoder.norm'), + ] + + num_layers = 6 + for idx in range(num_layers): + for module in ['encoder', 'decoder']: + pp_prefix = f'transformer.{module}.layers.{idx}' + th_prefix = f'transformer.{module}.layers.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.norm3', f'{pp_prefix}.norm3'), + (f'{th_prefix}.linear1', f'{pp_prefix}.mlp.linear1'), + (f'{th_prefix}.linear2', f'{pp_prefix}.mlp.linear2'), + (f'{th_prefix}.self_attn.in_proj_weight', f'{pp_prefix}.self_attn'), + (f'{th_prefix}.self_attn.in_proj_bias', f'{pp_prefix}.self_attn'), + (f'{th_prefix}.self_attn.out_proj', f'{pp_prefix}.self_attn.fc'), + (f'{th_prefix}.multihead_attn.in_proj_weight', f'{pp_prefix}.dec_enc_attn'), + (f'{th_prefix}.multihead_attn.in_proj_bias', f'{pp_prefix}.dec_enc_attn'), + (f'{th_prefix}.multihead_attn.out_proj', f'{pp_prefix}.dec_enc_attn.fc'), + ] + mapping.extend(layer_mapping) + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'***SET*** {th_name} {th_shape} ***TO*** {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + if len(value.shape) == 2 and transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + def _set_value_attn(th_name, pd_name): + th_shape = th_params[th_name].shape + print(f'***SET*** {th_name} {th_shape} ***TO*** {pd_name}') + if 'weight' in th_name: + value = th_params[th_name].data.transpose(1, 0) + value = value.chunk(3, axis=-1) + q,k,v = value[0].numpy(), value[1].numpy(), value[2].numpy() + #q = q.transpose((1,0)) + #k = k.transpose((1,0)) + #v = v.transpose((1,0)) + pd_params[f'{pd_name}.q.weight'].set_value(q) + pd_params[f'{pd_name}.k.weight'].set_value(k) + pd_params[f'{pd_name}.v.weight'].set_value(v) + elif 'bias' in th_name: + value = th_params[th_name].data + 
#print('00000000000000000000000000000000') + #print(value.shape) + #print(value) + value = value.chunk(3, axis=-1) + q,k,v = value[0].numpy(), value[1].numpy(), value[2].numpy() + #print('00000 q_b 00000') + #print(q) + #print('00000 k_b 00000') + #print(k) + #print('00000 v_b 00000') + #print(v) + pd_params[f'{pd_name}.q.bias'].set_value(q) + pd_params[f'{pd_name}.k.bias'].set_value(k) + pd_params[f'{pd_name}.v.bias'].set_value(v) + + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, buff in paddle_model.named_buffers(): + pd_params[name] = buff + for name, buff in torch_model.named_buffers(): + th_params[name] = buff + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if 'self_attn' in th_name or 'multihead_attn' in th_name: + _set_value_attn(th_name, pd_name) + else: + _set_value(th_name, pd_name) + else: # weight & bias + if f'{th_name}.weight' in th_params.keys(): + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + if th_name_w == 'query_embed.weight': + _set_value(th_name_w, pd_name_w, transpose=False) + else: + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + if f'{th_name}.running_mean' in th_params.keys(): + th_name_mean = f'{th_name}.running_mean' + pd_name_mean = f'{pd_name}._mean' + _set_value(th_name_mean, pd_name_mean) + + if f'{th_name}.running_var' in th_params.keys(): + th_name_mean = f'{th_name}.running_var' + pd_name_mean = f'{pd_name}._variance' + _set_value(th_name_mean, pd_name_mean) + + return paddle_model + + +def get_nested_tensors(): + with open('./t.npy', 'rb') as infile: + t = np.load(infile) + m = np.load(infile) + gts = np.load(infile, allow_pickle=True) + + print(t.shape) + print(m.shape) + + tt = torch.Tensor(t) + mm = torch.Tensor(m) + th_in = th_utils.NestedTensor(tt, mm) + + ttt = paddle.to_tensor(t) + mmm = paddle.to_tensor(m) + pp_in = NestedTensor(ttt, mmm) + + print(th_in, th_in.tensors.shape) + print(pp_in, pp_in.tensors.shape) + + targets = [] + for gt in gts: + target = dict() + for key, val in gt.items(): + target[key] = paddle.to_tensor(val) + targets.append(target) + targets = tuple(targets) + pp_gt = targets + + + return pp_in, th_in, pp_gt + + + + +#def get_nested_tensors(): +# samples = paddle.load(path='./batch_samples_01.pdtensor') +# pp_in = NestedTensor(samples['tensors'], samples['mask']) +# pp_target = paddle.load(path='./batch_targets_01.pdtensor') +# +# samples_tensor = samples['tensors'].cpu().numpy() +# samples_mask = samples['mask'].cpu().numpy() +# th_tensor = torch.Tensor(samples_tensor) +# th_mask = torch.Tensor(samples_mask) +# th_in = ThNestedTensor(th_tensor, th_mask) +# th_target = [] +# for item in pp_target: +# sample_gt = dict() +# for key, val in item.items(): +# th_tensor = torch.Tensor(val.cpu().numpy()) +# sample_gt[key] = th_tensor +# th_target.append(sample_gt) +# +# return th_in, th_target, pp_in, pp_target + + +def get_nested_tensors_random(): + x = np.random.randn(1, 3, 224, 224).astype('float32') + mask = np.ones([1, 224, 224]) + + pp_x = paddle.to_tensor(x) + pp_mask = 
paddle.to_tensor(mask) + pp_in = NestedTensor(pp_x, pp_mask) + th_tensor = torch.Tensor(x) + th_mask = torch.Tensor(mask) + th_in = ThNestedTensor(th_tensor, th_mask) + th_target = [] + pp_target = [] + + return th_in, th_target, pp_in, pp_target + + +def main(): + + paddle.set_device('gpu') + + #th_in, th_target, pp_in, pp_target = get_nested_tensors() + + paddle_model, paddle_criterion, paddle_postprocessors = build_detr() + paddle_model.eval() + + #print_model_named_params(paddle_model) + #print_model_named_buffers(paddle_model) + print('------------paddle model finish ----------------------') + + device = torch.device('cpu') + torch_model = torch.hub.load('facebookresearch/detr', 'detr_resnet50', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + + #print_model_named_params(torch_model) + #print_model_named_buffers(torch_model) + print('----------torch model finish------------------------') + + + + # convert weights + #paddle_model = convert(torch_model, paddle_model) + + model_dict = paddle.load('./detr_resnet50.pdparams') + paddle_model.set_dict(model_dict) + + + # check correctness + #th_in, th_target, pp_in, pp_target = get_nested_tensors() + #th_in, th_target, pp_in, pp_target = get_nested_tensors_random() + #x = np.random.randn(1, 3, 224, 224).astype('float32') + #x_paddle = paddle.to_tensor(x) + #x_torch = torch.Tensor(x).to(device) + + + pp_in, th_in, pp_gt = get_nested_tensors() + + #print(pp_in.tensors) + #print(pp_in.mask) + #print('-------- pp in finish ------------------') + + + #print(th_in.tensors, th_in.tensors.shape) + #print(th_in.mask, th_in.mask.shape) + #print('-------- th in finish ------------------') + + + out_paddle = paddle_model(pp_in) + loss = paddle_criterion(out_paddle, pp_gt) + print('=============== loss =============') + for key, val in loss.items(): + print(key, val.cpu().numpy()) + + #print(out_paddle['pred_logits'], out_paddle['pred_logits'].shape) + #print(out_paddle['pred_boxes'], out_paddle['pred_boxes'].shape) + #print('---------- paddle out finish ------------------------') + + #out_torch = torch_model(th_in) + #print(out_torch['pred_logits'], out_torch['pred_logits'].shape) + #print(out_torch['pred_boxes'], out_torch['pred_boxes'].shape) + #print('---------- torch out finish ------------------------') + + #out_torch = out_torch.data.cpu().numpy() + #out_paddle = out_paddle.cpu().numpy() + + #print(out_torch.shape, out_paddle.shape) + #print(out_torch[0:100]) + #print(out_paddle[0:100]) + #assert np.allclose(out_torch, out_paddle, atol = 1e-5) +# + # save weights for paddle model + #model_path = os.path.join('./detr_resnet50.pdparams') + #paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/object_detection/DETR/load_pytorch_weights_resnet101.py b/object_detection/DETR/load_pytorch_weights_resnet101.py new file mode 100644 index 00000000..7e2f0809 --- /dev/null +++ b/object_detection/DETR/load_pytorch_weights_resnet101.py @@ -0,0 +1,392 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +sys.path.append('/root/.cache/torch/hub/facebookresearch_detr_master/util/') + +from misc import NestedTensor as ThNestedTensor +import os +import argparse +import numpy as np +import paddle +import torch +#import timm +#from transformer import * +#from config import * +from detr import build_detr +from utils import NestedTensor + +import misc as th_utils +#config = get_config() +#parser = argparse.ArgumentParser('') +#parser.add_argument('-cfg', type=str, default='./configs/vit_large_patch16_224.yaml') +##parser.add_argument('-dataset', type=str, default="imagenet2012") +#parser.add_argument('-dataset', type=str, default="cifar10") +#parser.add_argument('-batch_size', type=int, default=4) +#parser.add_argument('-image_size', type=int, default=224) +#parser.add_argument('-data_path', type=str, default='/dataset/imagenet/') +#parser.add_argument('-eval', action="store_true") +#parser.add_argument('-pretrained', type=str, default=None) +#args = parser.parse_args() +# +#config = get_config() +#config = update_config(config, args) +#print(config) +# +# +def print_model_named_params(model): + for name, param in model.named_parameters(): + print(name, param.shape) + +def print_model_named_buffers(model): + for name, buff in model.named_buffers(): + print(name, buff.shape) + + +def torch_to_paddle_mapping(): + map1 = torch_to_paddle_mapping_backbone() + map2 = torch_to_paddle_mapping_transformer() + map3 = torch_to_paddle_mapping_bn_from_buffer() + map1.extend(map2) + map1.extend(map3) + return map1 + +def torch_to_paddle_mapping_bn_from_buffer(): + mapping = [('backbone.0.body.bn1','backbone.0.body.bn1')] + + #block_depth = [3, 4, 6, 3] + block_depth = [3, 4, 23, 3] # resnet101 + + for block_idx in range(1,5): + th_block_prefix = f'backbone.0.body.layer{block_idx}' + pp_block_prefix = f'backbone.0.body.layer{block_idx}' + mapping.append((f'{th_block_prefix}.0.downsample.1', + f'{pp_block_prefix}.0.downsample.1')) + + for layer_idx in range(block_depth[block_idx-1]): + th_prefix = f'{th_block_prefix}.{layer_idx}' + pp_prefix = f'{pp_block_prefix}.{layer_idx}' + layer_mapping = [ + (f'{th_prefix}.bn1', f'{pp_prefix}.bn1'), + (f'{th_prefix}.bn2', f'{pp_prefix}.bn2'), + (f'{th_prefix}.bn3', f'{pp_prefix}.bn3'), + ] + mapping.extend(layer_mapping) + return mapping + +def torch_to_paddle_mapping_backbone(): + mapping = [('backbone.0.body.conv1','backbone.0.body.conv1')] + + #block_depth = [3, 4, 6, 3] # resnet50 + block_depth = [3, 4, 23, 3] # resnet101 + + for block_idx in range(1,5): + th_block_prefix = f'backbone.0.body.layer{block_idx}' + pp_block_prefix = f'backbone.0.body.layer{block_idx}' + mapping.append((f'{th_block_prefix}.0.downsample.0', + f'{pp_block_prefix}.0.downsample.0')) + + for layer_idx in range(block_depth[block_idx-1]): + th_prefix = f'{th_block_prefix}.{layer_idx}' + pp_prefix = f'{pp_block_prefix}.{layer_idx}' + layer_mapping = [ + (f'{th_prefix}.conv1', f'{pp_prefix}.conv1'), + (f'{th_prefix}.conv2', f'{pp_prefix}.conv2'), + (f'{th_prefix}.conv3', f'{pp_prefix}.conv3'), + ] + mapping.extend(layer_mapping) + return mapping + + +def torch_to_paddle_mapping_transformer(): + mapping = [ + ('class_embed', 'class_embed'), + ('query_embed', 'query_embed'), + ('input_proj', 'input_proj'), + ('bbox_embed.layers.0', 'bbox_embed.layers.0'), + ('bbox_embed.layers.1', 'bbox_embed.layers.1'), + ('bbox_embed.layers.2', 'bbox_embed.layers.2'), + ('transformer.decoder.norm', 
'transformer.decoder.norm'), + ] + + num_layers = 6 + for idx in range(num_layers): + for module in ['encoder', 'decoder']: + pp_prefix = f'transformer.{module}.layers.{idx}' + th_prefix = f'transformer.{module}.layers.{idx}' + layer_mapping = [ + (f'{th_prefix}.norm1', f'{pp_prefix}.norm1'), + (f'{th_prefix}.norm2', f'{pp_prefix}.norm2'), + (f'{th_prefix}.norm3', f'{pp_prefix}.norm3'), + (f'{th_prefix}.linear1', f'{pp_prefix}.mlp.linear1'), + (f'{th_prefix}.linear2', f'{pp_prefix}.mlp.linear2'), + (f'{th_prefix}.self_attn.in_proj_weight', f'{pp_prefix}.self_attn'), + (f'{th_prefix}.self_attn.in_proj_bias', f'{pp_prefix}.self_attn'), + (f'{th_prefix}.self_attn.out_proj', f'{pp_prefix}.self_attn.fc'), + (f'{th_prefix}.multihead_attn.in_proj_weight', f'{pp_prefix}.dec_enc_attn'), + (f'{th_prefix}.multihead_attn.in_proj_bias', f'{pp_prefix}.dec_enc_attn'), + (f'{th_prefix}.multihead_attn.out_proj', f'{pp_prefix}.dec_enc_attn.fc'), + ] + mapping.extend(layer_mapping) + return mapping + + + +def convert(torch_model, paddle_model): + def _set_value(th_name, pd_name, transpose=True): + th_shape = th_params[th_name].shape + pd_shape = tuple(pd_params[pd_name].shape) # paddle shape default type is list + #assert th_shape == pd_shape, f'{th_shape} != {pd_shape}' + print(f'***SET*** {th_name} {th_shape} ***TO*** {pd_name} {pd_shape}') + if isinstance(th_params[th_name], torch.nn.parameter.Parameter): + value = th_params[th_name].data.numpy() + else: + value = th_params[th_name].numpy() + if len(value.shape) == 2 and transpose: + value = value.transpose((1, 0)) + pd_params[pd_name].set_value(value) + + def _set_value_attn(th_name, pd_name): + th_shape = th_params[th_name].shape + print(f'***SET*** {th_name} {th_shape} ***TO*** {pd_name}') + if 'weight' in th_name: + value = th_params[th_name].data.transpose(1, 0) + value = value.chunk(3, axis=-1) + q,k,v = value[0].numpy(), value[1].numpy(), value[2].numpy() + #q = q.transpose((1,0)) + #k = k.transpose((1,0)) + #v = v.transpose((1,0)) + pd_params[f'{pd_name}.q.weight'].set_value(q) + pd_params[f'{pd_name}.k.weight'].set_value(k) + pd_params[f'{pd_name}.v.weight'].set_value(v) + elif 'bias' in th_name: + value = th_params[th_name].data + #print('00000000000000000000000000000000') + #print(value.shape) + #print(value) + value = value.chunk(3, axis=-1) + q,k,v = value[0].numpy(), value[1].numpy(), value[2].numpy() + #print('00000 q_b 00000') + #print(q) + #print('00000 k_b 00000') + #print(k) + #print('00000 v_b 00000') + #print(v) + pd_params[f'{pd_name}.q.bias'].set_value(q) + pd_params[f'{pd_name}.k.bias'].set_value(k) + pd_params[f'{pd_name}.v.bias'].set_value(v) + + + # 1. get paddle and torch model parameters + pd_params = {} + th_params = {} + for name, param in paddle_model.named_parameters(): + pd_params[name] = param + for name, param in torch_model.named_parameters(): + th_params[name] = param + + for name, buff in paddle_model.named_buffers(): + pd_params[name] = buff + for name, buff in torch_model.named_buffers(): + th_params[name] = buff + + # 2. get name mapping pairs + mapping = torch_to_paddle_mapping() + # 3. 
set torch param values to paddle params: may needs transpose on weights + for th_name, pd_name in mapping: + if th_name in th_params.keys(): # nn.Parameters + if 'self_attn' in th_name or 'multihead_attn' in th_name: + _set_value_attn(th_name, pd_name) + else: + _set_value(th_name, pd_name) + else: # weight & bias + if f'{th_name}.weight' in th_params.keys(): + th_name_w = f'{th_name}.weight' + pd_name_w = f'{pd_name}.weight' + if th_name_w == 'query_embed.weight': + _set_value(th_name_w, pd_name_w, transpose=False) + else: + _set_value(th_name_w, pd_name_w) + + if f'{th_name}.bias' in th_params.keys(): + th_name_b = f'{th_name}.bias' + pd_name_b = f'{pd_name}.bias' + _set_value(th_name_b, pd_name_b) + + if f'{th_name}.running_mean' in th_params.keys(): + th_name_mean = f'{th_name}.running_mean' + pd_name_mean = f'{pd_name}._mean' + _set_value(th_name_mean, pd_name_mean) + + if f'{th_name}.running_var' in th_params.keys(): + th_name_mean = f'{th_name}.running_var' + pd_name_mean = f'{pd_name}._variance' + _set_value(th_name_mean, pd_name_mean) + + return paddle_model + + +def get_nested_tensors(): + with open('./t.npy', 'rb') as infile: + t = np.load(infile) + m = np.load(infile) + gts = np.load(infile, allow_pickle=True) + + print(t.shape) + print(m.shape) + + tt = torch.Tensor(t) + mm = torch.Tensor(m) + th_in = th_utils.NestedTensor(tt, mm) + + ttt = paddle.to_tensor(t) + mmm = paddle.to_tensor(m) + pp_in = NestedTensor(ttt, mmm) + + print(th_in, th_in.tensors.shape) + print(pp_in, pp_in.tensors.shape) + + targets = [] + for gt in gts: + target = dict() + for key, val in gt.items(): + target[key] = paddle.to_tensor(val) + targets.append(target) + targets = tuple(targets) + pp_gt = targets + + + return pp_in, th_in, pp_gt + + + + +#def get_nested_tensors(): +# samples = paddle.load(path='./batch_samples_01.pdtensor') +# pp_in = NestedTensor(samples['tensors'], samples['mask']) +# pp_target = paddle.load(path='./batch_targets_01.pdtensor') +# +# samples_tensor = samples['tensors'].cpu().numpy() +# samples_mask = samples['mask'].cpu().numpy() +# th_tensor = torch.Tensor(samples_tensor) +# th_mask = torch.Tensor(samples_mask) +# th_in = ThNestedTensor(th_tensor, th_mask) +# th_target = [] +# for item in pp_target: +# sample_gt = dict() +# for key, val in item.items(): +# th_tensor = torch.Tensor(val.cpu().numpy()) +# sample_gt[key] = th_tensor +# th_target.append(sample_gt) +# +# return th_in, th_target, pp_in, pp_target + + +def get_nested_tensors_random(): + x = np.random.randn(1, 3, 224, 224).astype('float32') + mask = np.ones([1, 224, 224]) + + pp_x = paddle.to_tensor(x) + pp_mask = paddle.to_tensor(mask) + pp_in = NestedTensor(pp_x, pp_mask) + th_tensor = torch.Tensor(x) + th_mask = torch.Tensor(mask) + th_in = ThNestedTensor(th_tensor, th_mask) + th_target = [] + pp_target = [] + + return th_in, th_target, pp_in, pp_target + + +def main(): + + paddle.set_device('cpu') + #paddle.set_device('gpu') + + #th_in, th_target, pp_in, pp_target = get_nested_tensors() + + paddle_model, paddle_criterion, paddle_postprocessors = build_detr() + paddle_model.eval() + + print_model_named_params(paddle_model) + print_model_named_buffers(paddle_model) + print('------------paddle model finish ----------------------') + + device = torch.device('cpu') + torch_model = torch.hub.load('facebookresearch/detr', 'detr_resnet101', pretrained=True) + torch_model = torch_model.to(device) + torch_model.eval() + + print_model_named_params(torch_model) + print_model_named_buffers(torch_model) + print('----------torch 
model finish------------------------') + + + # convert weights + paddle_model = convert(torch_model, paddle_model) + + #model_dict = paddle.load('./detr_resnet101.pdparams') + #paddle_model.set_dict(model_dict) + + + # check correctness + #th_in, th_target, pp_in, pp_target = get_nested_tensors() + #th_in, th_target, pp_in, pp_target = get_nested_tensors_random() + #x = np.random.randn(1, 3, 224, 224).astype('float32') + #x_paddle = paddle.to_tensor(x) + #x_torch = torch.Tensor(x).to(device) + + + pp_in, th_in, pp_gt = get_nested_tensors() + + #print(pp_in.tensors) + #print(pp_in.mask) + #print('-------- pp in finish ------------------') + + + #print(th_in.tensors, th_in.tensors.shape) + #print(th_in.mask, th_in.mask.shape) + #print('-------- th in finish ------------------') + + + #out_paddle = paddle_model(pp_in) + #loss = paddle_criterion(out_paddle, pp_gt) + #print('=============== loss =============') + #for key, val in loss.items(): + # print(key, val.cpu().numpy()) + + #print(out_paddle['pred_logits'], out_paddle['pred_logits'].shape) + #print(out_paddle['pred_boxes'], out_paddle['pred_boxes'].shape) + #print('---------- paddle out finish ------------------------') + + #out_torch = torch_model(th_in) + #print(out_torch['pred_logits'], out_torch['pred_logits'].shape) + #print(out_torch['pred_boxes'], out_torch['pred_boxes'].shape) + #print('---------- torch out finish ------------------------') + + #out_torch = out_torch.data.cpu().numpy() + #out_paddle = out_paddle.cpu().numpy() + + #print(out_torch.shape, out_paddle.shape) + #print(out_torch[0:100]) + #print(out_paddle[0:100]) + #assert np.allclose(out_torch, out_paddle, atol = 1e-5) +# + # save weights for paddle model + model_path = os.path.join('./detr_resnet101.pdparams') + paddle.save(paddle_model.state_dict(), model_path) + + +if __name__ == "__main__": + main() diff --git a/object_detection/DETR/main_multi_gpu.py b/object_detection/DETR/main_multi_gpu.py new file mode 100644 index 00000000..43db92e5 --- /dev/null +++ b/object_detection/DETR/main_multi_gpu.py @@ -0,0 +1,405 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
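
Before the training scripts, a brief note on the conversion step above: torch `nn.Linear` stores weights as `[out_features, in_features]` while Paddle expects `[in_features, out_features]`, and torch's fused `in_proj_weight` in `nn.MultiheadAttention` packs q/k/v along the output axis. A minimal numpy sketch of what `_set_value_attn` does (the hidden size 256 is illustrative only, not taken from the config):

```python
import numpy as np

dim = 256  # illustrative hidden size
in_proj_weight = np.random.randn(3 * dim, dim).astype('float32')  # torch layout [3*dim, dim]

# transpose to Paddle's [in_features, out_features] layout ...
w = in_proj_weight.transpose(1, 0)            # [dim, 3*dim]
# ... then split the packed output axis into separate q/k/v projection weights
q_w, k_w, v_w = np.split(w, 3, axis=-1)
assert q_w.shape == k_w.shape == v_w.shape == (dim, dim)
```
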
+ +"""DETR training/validation using multiple GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as dist +from coco import build_coco +from coco import get_dataloader +from coco_eval import CocoEvaluator +from detr import build_detr +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('DETR') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-backbone', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-eval', action='store_true') +arguments = parser.parse_args() + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, arguments) + +# set output folder +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +#config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + postprocessors, + base_ds, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, DETR model + criterion: nn.Layer + postprocessors: nn.Layer + base_ds: coco api instance + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + accum_iter: int, num of iters for accumulating gradients + Returns: + train_loss_ce_meter.avg + train_loss_bbox_meter.avg + train_loss_giou_meter.avg + train_time + """ + + model.train() + criterion.train() + train_loss_ce_meter = AverageMeter() + train_loss_bbox_meter = AverageMeter() + train_loss_giou_meter = AverageMeter() + time_st = time.time() + iou_types = ('bbox', ) + coco_evaluator = CocoEvaluator(base_ds, iou_types) + + for batch_id, data in enumerate(dataloader): + samples = data[0] + targets = data[1] + #targets = [{k:v for k,v in t.items()} for t in targets] + + outputs = model(samples) + loss_dict = criterion(outputs, targets) + weight_dict = criterion.weight_dict + losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + + losses.backward() + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + # logging losses + batch_size = samples.tensors.shape[0] + train_loss_ce_meter.update(loss_dict['loss_ce'].numpy()[0], batch_size) + train_loss_bbox_meter.update(loss_dict['loss_bbox'].numpy()[0], batch_size) + 
train_loss_giou_meter.update(loss_dict['loss_giou'].numpy()[0], batch_size) + + if batch_id > 0 and batch_id % debug_steps == 0: + logger.info( + f"Train Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg loss_ce: {train_loss_ce_meter.avg:.4f}, " + + f"Avg loss_bbox: {train_loss_bbox_meter.avg:.4f}, " + + f"Avg loss_giou: {train_loss_giou_meter.avg:.4f}, ") + + train_time = time.time() - time_st + return train_loss_ce_meter.avg, train_loss_bbox_meter.avg, train_loss_giou_meter.avg, train_time + + +def validate(dataloader, model, criterion, postprocessors, base_ds, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: criterion + postprocessors: postprocessor for generating bboxes + base_ds: COCO instance + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc_meter.avg + val_time + """ + model.eval() + criterion.eval() + + val_loss_ce_meter = AverageMeter() + val_loss_bbox_meter = AverageMeter() + val_loss_giou_meter = AverageMeter() + + time_st = time.time() + + iou_types = ('bbox', ) + coco_evaluator = CocoEvaluator(base_ds, iou_types) + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + samples = data[0] + targets = data[1] + targets = [{k:v for k, v in t.items()} for t in targets] + + outputs = model(samples) + loss_dict = criterion(outputs, targets) + weight_dict = criterion.weight_dict + + # logging val losses + dist.all_reduce(loss_dict['loss_ce']) + dist.all_reduce(loss_dict['loss_bbox']) + dist.all_reduce(loss_dict['loss_giou']) + loss_dict['loss_ce'] /= dist.get_world_size() + loss_dict['loss_bbox'] /= dist.get_world_size() + loss_dict['loss_giou'] /= dist.get_world_size() + + batch_size = paddle.to_tensor(samples.tensors.shape[0]) + dist.all_reduce(batch_size) + batch_size = batch_size.numpy()[0] + val_loss_ce_meter.update(loss_dict['loss_ce'].numpy()[0], batch_size) + val_loss_bbox_meter.update(loss_dict['loss_bbox'].numpy()[0], batch_size) + val_loss_giou_meter.update(loss_dict['loss_giou'].numpy()[0], batch_size) + + if batch_id > 0 and batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg loss_ce: {val_loss_ce_meter.avg:.4f}, " + + f"Avg loss_bbox: {val_loss_bbox_meter.avg:.4f}, " + + f"Avg loss_giou: {val_loss_giou_meter.avg:.4f}, ") + + # coco evaluate + orig_target_sizes = paddle.stack([t['orig_size'] for t in targets], axis=0) + results = postprocessors['bbox'](outputs, orig_target_sizes) + res = {target['image_id'].cpu().numpy()[0]: output for target, output in zip(targets, results)} + + if coco_evaluator is not None: + coco_evaluator.update(res) + + if coco_evaluator is not None: + coco_evaluator.synchronize_between_processes() + coco_evaluator.accumulate() + coco_evaluator.summarize() + + val_time = time.time() - time_st + return val_loss_ce_meter.avg, val_loss_bbox_meter.avg, val_loss_giou_meter.avg, val_time + + +def main_worker(*args): + # 0. Preparation + dist.init_parallel_env() + last_epoch = config.TRAIN.LAST_EPOCH + world_size = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + seed = config.SEED + local_rank + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # 1. 
Create model + model, criterion, postprocessors = build_detr(config) + model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train, dataset_val = args[0], args[1] + total_batch_train = 0 + if not config.EVAL: + dataloader_train = get_dataloader(dataset_train, + batch_size=config.DATA.BATCH_SIZE, + mode='train', + multi_gpu=True) + total_batch_train = len(dataloader_train) + + dataloader_val = get_dataloader(dataset_val, + batch_size=config.DATA.BATCH_SIZE_EVAL, + mode='val', + multi_gpu=True) + total_batch_val = len(dataloader_val) + base_ds = dataset_val.coco # pycocotools.coco.COCO(anno_file) + + logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + # 4. Define optimizer and lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip, + #apply_decay_param_fun=get_exclude_from_weight_decay_fn(['pos_embed', 'cls_token']), + ) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # 5. 
Load pretrained model / load resumt model and optimizer states + if config.MODEL.PRETRAINED: + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_dict(opt_state) + logger.info( + f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + + # 6. Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss_ce, val_loss_bbox, val_loss_giou, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + postprocessors=postprocessors, + base_ds=base_ds, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss_ce: {val_loss_ce:.4f}, " + + f"Validation Loss_bbox: {val_loss_bbox:.4f}, " + + f"Validation Loss_giou: {val_loss_giou:.4f}, " + + f"time: {val_time:.2f}") + return + + # 6. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + postprocessors=postprocessors, + base_ds=base_ds, + optimizer=optimizer, + epoch=epoch, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER) + scheduler.step() + + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss_ce, val_loss_bbox, val_loss_giou, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + postprocessors=postprocessors, + base_ds=base_ds, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss_ce: {val_loss_ce:.4f}, " + + f"Validation Loss_bbox: {val_loss_bbox:.4f}, " + + f"Validation Loss_giou: {val_loss_giou:.4f}, " + + f"time: {val_time:.2f}") + # model save + if local_rank == 0: + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path) + paddle.save(optimizer.state_dict(), model_path) + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +def main(): + if not config.EVAL: + dataset_train = build_coco('train', config.DATA.DATA_PATH) + else: + dataset_train = None + dataset_val = build_coco('val', config.DATA.DATA_PATH) + config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS + 
dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + + +if __name__ == "__main__": + main() diff --git a/object_detection/DETR/main_single_gpu.py b/object_detection/DETR/main_single_gpu.py new file mode 100644 index 00000000..6330105c --- /dev/null +++ b/object_detection/DETR/main_single_gpu.py @@ -0,0 +1,318 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn.functional as F +import paddle.distributed as dist +from coco import build_coco +from coco import get_dataloader +from coco_eval import CocoEvaluator +from detr import build_detr +from config import get_config +from config import update_config +from utils import WarmupCosineScheduler +from utils import AverageMeter + + +parser = argparse.ArgumentParser('DETR') +parser.add_argument('-cfg', type=str, default='./configs/detr_resnet50.yaml') +parser.add_argument('-dataset', type=str, default="coco") +parser.add_argument('-batch_size', type=int, default=4) +parser.add_argument('-data_path', type=str, default='/dataset/coco/') +parser.add_argument('-backbone', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-eval', action='store_true') +args = parser.parse_args() + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +config = get_config() +config = update_config(config, args) + +if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +config.freeze() + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, model, criterion, postprocessors, base_ds, optimizer, epoch, total_batch, debug_steps=100, accum_iter=1): + model.train() + criterion.train() + + train_loss_ce_meter = AverageMeter() + train_loss_bbox_meter = AverageMeter() + train_loss_giou_meter = AverageMeter() + + time_st = time.time() + + iou_types = ('bbox', ) + coco_evaluator = CocoEvaluator(base_ds, iou_types) + + for batch_id, data in enumerate(dataloader): + samples = data[0] + targets = data[1] + #targets = [{k:v for k,v in t.items()} for t in targets] + + outputs = model(samples) + loss_dict = criterion(outputs, targets) + weight_dict = criterion.weight_dict + losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + + losses.backward() + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() 
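+            # gradients were accumulated over `accum_iter` batches before step(); clear them for the next accumulation window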
+ optimizer.clear_grad() + + # logging losses + batch_size = samples.tensors.shape[0] + train_loss_ce_meter.update(loss_dict['loss_ce'].numpy()[0], batch_size) + train_loss_bbox_meter.update(loss_dict['loss_bbox'].numpy()[0], batch_size) + train_loss_giou_meter.update(loss_dict['loss_giou'].numpy()[0], batch_size) + + if batch_id > 0 and batch_id % debug_steps == 0: + logger.info( + f"Train Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg loss_ce: {train_loss_ce_meter.avg:.4f}, " + + f"Avg loss_bbox: {train_loss_bbox_meter.avg:.4f}, " + + f"Avg loss_giou: {train_loss_giou_meter.avg:.4f}, ") + + train_time = time.time() - time_st + return train_loss_ce_meter.avg, train_loss_bbox_meter.avg, train_loss_giou_meter.avg, train_time + + +def validate(dataloader, model, criterion, postprocessors, base_ds, total_batch, debug_steps=100): + model.eval() + criterion.eval() + + val_loss_ce_meter = AverageMeter() + val_loss_bbox_meter = AverageMeter() + val_loss_giou_meter = AverageMeter() + + time_st = time.time() + + iou_types = ('bbox', ) + coco_evaluator = CocoEvaluator(base_ds, iou_types) + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + samples = data[0] + targets = data[1] + #targets = [{k:v for k,v in t.items()} for t in targets] + + outputs = model(samples) + loss_dict = criterion(outputs, targets) + weight_dict = criterion.weight_dict + + # logging val losses + batch_size = samples.tensors.shape[0] + val_loss_ce_meter.update(loss_dict['loss_ce'].numpy()[0], batch_size) + val_loss_bbox_meter.update(loss_dict['loss_bbox'].numpy()[0], batch_size) + val_loss_giou_meter.update(loss_dict['loss_giou'].numpy()[0], batch_size) + + if batch_id > 0 and batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg loss_ce: {val_loss_ce_meter.avg:.4f}, " + + f"Avg loss_bbox: {val_loss_bbox_meter.avg:.4f}, " + + f"Avg loss_giou: {val_loss_giou_meter.avg:.4f}, ") + + # coco evaluate + orig_target_sizes = paddle.stack([t['orig_size'] for t in targets], axis=0) + results = postprocessors['bbox'](outputs, orig_target_sizes) + res = {target['image_id'].cpu().numpy()[0]: output for target, output in zip(targets, results)} + if coco_evaluator is not None: + coco_evaluator.update(res) + + if coco_evaluator is not None: + coco_evaluator.synchronize_between_processes() + coco_evaluator.accumulate() + coco_evaluator.summarize() #TODO: get stats[0] and return mAP + + val_time = time.time() - time_st + return val_loss_ce_meter.avg, val_loss_bbox_meter.avg, val_loss_giou_meter.avg, val_time + + +def main(): + # 0. Preparation + last_epoch = config.TRAIN.LAST_EPOCH + seed = config.SEED + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # TODO: set backbone_lr + # 1. Create model and criterion + model, criterion, postprocessors = build_detr(config) + # 2. Create train and val dataloader + if not config.EVAL: + dataset_train = build_coco('train', config.DATA.DATA_PATH) + dataloader_train = get_dataloader(dataset_train, + batch_size=config.DATA.BATCH_SIZE, + mode='train', + multi_gpu=False) + + dataset_val = build_coco('val', config.DATA.DATA_PATH) + dataloader_val = get_dataloader(dataset_val, + batch_size=config.DATA.BATCH_SIZE_EVAL, + mode='val', + multi_gpu=False) + + base_ds = dataset_val.coco # pycocotools.coco.COCO(anno_file) + # 3. 
Define lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestons, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + # 5. Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum(parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip, + ) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + optimizer = paddle.optimizer.AdamW(parameters=model.parameters(), + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + epsilon=config.TRAIN.OPTIMIZER.EPS, + ) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # 6. Load pretrained model or load resume model and optimizer states + if config.MODEL.PRETRAINED: + #if config.MODEL.PRETRAINED and os.path.isfile(config.MODEL.PRETRAINED + '.pdparams'): + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME and os.path.isfile(config.MODEL.RESUME+'.pdparams') and os.path.isfile(config.MODEL.RESUME+'.pdopt'): + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_dict(opt_state) + logger.info(f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + + # 7. Validation + if config.EVAL: + logger.info(f'----- Start Validating') + val_loss_ce, val_loss_bbox, val_loss_giou, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + postprocessors=postprocessors, + base_ds=base_ds, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss ce: {val_loss_ce:.4f}, " + + f"Validation Loss bbox: {val_loss_bbox:.4f}, " + + f"Validation Loss giou: {val_loss_giou:.4f}, " + + f"time: {val_time:.2f}") + return + + # 8. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. 
LR={optimizer.get_lr():.6f}") + train_loss_ce, train_loss_bbox, train_loss_giou, train_time = train( + dataloader=dataloader_train, + model=model, + criterion=criterion, + postprocessors=postprocessors, + base_ds=base_ds, + optimizer=optimizer, + epoch=epoch, + total_batch=len(dataloader_train), + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER) + scheduler.step() + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss ce: {train_loss_ce:.4f}, " + + f"Train Loss bbox: {train_loss_bbox:.4f}, " + + f"Train Loss giou: {train_loss_giou:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss_ce, val_loss_bbox, val_loss_giou, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + postprocessors=postprocessors, + base_ds=base_ds, + total_batch=len(dataloader_val), + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss ce: {val_loss_ce:.4f}, " + + f"Validation Loss bbox: {val_loss_bbox:.4f}, " + + f"Validation Loss giou: {val_loss_giou:.4f}, " + + f"time: {val_time:.2f}") + # model save + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join(config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path) + paddle.save(optimizer.state_dict(), model_path) + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +if __name__ == "__main__": + main() diff --git a/object_detection/DETR/matcher.py b/object_detection/DETR/matcher.py new file mode 100644 index 00000000..57aa328e --- /dev/null +++ b/object_detection/DETR/matcher.py @@ -0,0 +1,138 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
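
For orientation before the matcher code below: the bipartite matching is delegated to `scipy.optimize.linear_sum_assignment`, which takes a `[num_queries, num_targets]` cost matrix and returns the index pairs with minimal total cost. A toy sketch with made-up numbers:

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

# 4 predicted queries x 2 ground-truth boxes (illustrative costs only)
cost = np.array([[0.9, 0.1],
                 [0.4, 0.8],
                 [0.2, 0.7],
                 [0.6, 0.3]])

row_idx, col_idx = linear_sum_assignment(cost)  # Hungarian assignment
matches = [(int(i), int(j)) for i, j in zip(row_idx, col_idx)]
print(matches)  # [(0, 1), (2, 0)]: query 0 <-> box 1, query 2 <-> box 0
```
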
+ +""" +Hugraian matching algorithm for predictions and targets +""" + +from scipy.optimize import linear_sum_assignment +from scipy.spatial import distance +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from box_ops import box_cxcywh_to_xyxy, generalized_box_iou + +class HungarianMatcher(nn.Layer): + def __init__(self, cost_class=1., cost_bbox=1., cost_giou=1.): + super().__init__() + self.cost_class = cost_class + self.cost_bbox = cost_bbox + self.cost_giou = cost_giou + + def forward(self, outputs, targets): + """ + Args: + outputs: dict contains 'pred_logits' and 'pred_boxes' + pred_logits: [batch_size, num_queires, num_classes] + pred_boxes: [batch_size, num_queires, 4] + targets: list(tuple) of targets, len(targets) = batch_size, each target is a dict contains at least 'labels' and 'bboxes' + labels: [num_target_boxes], containing the class labels + boxes: [num_target_boxes, 4], containing the gt bboxes + """ + with paddle.no_grad(): + batch_size, num_queries = outputs['pred_logits'].shape[:2] + # outputs: [batch_size , num_queries , num_classes] + # pred_boxes: [batch_size , num_queries , 4 + #print('========= orig pred boxes ======') + #print(outputs['pred_boxes']) + + out_prob = F.softmax(outputs['pred_logits'].flatten(0, 1), -1) # [batch_size*num_queries, num_classes] + out_bbox = outputs['pred_boxes'].flatten(0, 1) #[batch_size*num_queries, 4] + + #print('-------targets----------') + #print(targets) + # torch no error: torch.cat([torch.empty([0])]), returns tensor([]) + # paddle raise error: paddle.concat([paddle.empty([0])]), raise ValueError + #print([v['labels'] for v in targets]) + + idx_list = [] + for v in targets: + if not v['labels'].is_empty(): + idx_list.append(v['labels']) + if len(idx_list) > 0: + tgt_idx = paddle.concat(idx_list) + tgt_idx = tgt_idx.astype('int32') + else: + tgt_idx = paddle.empty([0], dtype='int32') + + #tgt_idx = paddle.concat([v['labels'] for v in targets]) + #tgt_idx = tgt_idx.astype('int32') + + #tgt_bbox = paddle.concat([v['boxes'] for v in targets]) + bbox_list = [] + for v in targets: + if not v['boxes'].is_empty(): + bbox_list.append(v['boxes']) + if len(bbox_list) > 0: + tgt_bbox = paddle.concat(bbox_list) + else: + tgt_bbox = paddle.empty([0], dtype='float32') + + + ## SAME + #print('out_bbox', out_bbox, out_bbox.shape) + #print('tgt_bbox,', tgt_bbox, tgt_bbox.shape) + + if tgt_idx.is_empty(): + cost_class = 0 + else: + cost_class = -paddle.index_select(out_prob, tgt_idx, axis=1) + #print('cost_class = ', cost_class) + + #cost_bbox = paddle.cdist(out_bbox, tgt_bbox, p=1) # TODO: impl paddle cdist for tensors + # conver back to numpy for temp use + out_bbox = out_bbox.cpu().numpy() + tgt_bbox = tgt_bbox.cpu().numpy() + cost_bbox = distance.cdist(out_bbox, tgt_bbox, 'minkowski', p=1).astype('float32') + cost_bbox = paddle.to_tensor(cost_bbox) + + out_bbox = paddle.to_tensor(out_bbox) + tgt_bbox = paddle.to_tensor(tgt_bbox) + + # SAME + #print('cost_bbox, ', cost_bbox.shape) + #print('cost_bbox =', cost_bbox) + + cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)) + #SAME + #print('cost_giou', cost_giou, cost_giou.shape) + + C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou + C = C.reshape([batch_size, num_queries, -1]) + sizes = [len(v['boxes']) for v in targets] + + # When sizes = [0, n] (no boxes) + # pytorch C.split(sizes, -1)[0][0] returns: tensor([], size=(100, 0)) + # but paddle C.split(sizes, -1)[0][0] raises error + # 
original code in pytorch: + #idxs = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] + # We fix for paddle: + idxs = [] + for i, c in enumerate(C.split(sizes, -1)): + if c.shape[-1] == 0: + idx = linear_sum_assignment(paddle.empty((c.shape[1], c.shape[2]))) + else: + idx = linear_sum_assignment(c[i]) + idxs.append(idx) + + + #SAME + #print('idxs=', idxs) + + return [(paddle.to_tensor(i, dtype='int64'), paddle.to_tensor(j, dtype='int64')) for i,j in idxs] + + +def build_matcher(): + return HungarianMatcher(cost_class=1, cost_bbox=5, cost_giou=2) + diff --git a/object_detection/DETR/mixup.py b/object_detection/DETR/mixup.py new file mode 100644 index 00000000..a6578146 --- /dev/null +++ b/object_detection/DETR/mixup.py @@ -0,0 +1,312 @@ +""" Mixup and Cutmix +Papers: +mixup: Beyond Empirical Risk Minimization (https://arxiv.org/abs/1710.09412) +CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features (https://arxiv.org/abs/1905.04899) +Code Reference: +CutMix: https://github.com/clovaai/CutMix-PyTorch +""" +import numpy as np +import paddle +import paddle.nn.Functional as F + + +def one_hot(x, num_classes, on_value=1., off_value=0.): + one_hot = F.one_hot(x, num_classes) + return paddle.scatter_(paddle.full((x.shape[0], num_classes), off_value), x, on_value) + + +def mixup_target(target, num_classes, lam=1., smoothing=0.0, device='cuda'): + off_value = smoothing / num_classes + on_value = 1. - smoothing + off_value + y1 = one_hot(target, num_classes, on_value=on_value, off_value=off_value, device=device) + y2 = one_hot(target.flip(0), num_classes, on_value=on_value, off_value=off_value, device=device) + return y1 * lam + y2 * (1. - lam) + + +def rand_bbox(img_shape, lam, margin=0., count=None): + """ Standard CutMix bounding-box + Generates a random square bbox based on lambda value. This impl includes + support for enforcing a border margin as percent of bbox dimensions. + + Args: + img_shape (tuple): Image shape as tuple + lam (float): Cutmix lambda value + margin (float): Percentage of bbox dimension to enforce as margin (reduce amount of box outside image) + count (int): Number of bbox to generate + """ + ratio = np.sqrt(1 - lam) + img_h, img_w = img_shape[-2:] + cut_h, cut_w = int(img_h * ratio), int(img_w * ratio) + margin_y, margin_x = int(margin * cut_h), int(margin * cut_w) + cy = np.random.randint(0 + margin_y, img_h - margin_y, size=count) + cx = np.random.randint(0 + margin_x, img_w - margin_x, size=count) + yl = np.clip(cy - cut_h // 2, 0, img_h) + yh = np.clip(cy + cut_h // 2, 0, img_h) + xl = np.clip(cx - cut_w // 2, 0, img_w) + xh = np.clip(cx + cut_w // 2, 0, img_w) + return yl, yh, xl, xh + + +def rand_bbox_minmax(img_shape, minmax, count=None): + """ Min-Max CutMix bounding-box + Inspired by Darknet cutmix impl, generates a random rectangular bbox + based on min/max percent values applied to each dimension of the input image. + + Typical defaults for minmax are usually in the .2-.3 for min and .8-.9 range for max. 
+ + Args: + img_shape (tuple): Image shape as tuple + minmax (tuple or list): Min and max bbox ratios (as percent of image size) + count (int): Number of bbox to generate + """ + assert len(minmax) == 2 + img_h, img_w = img_shape[-2:] + cut_h = np.random.randint(int(img_h * minmax[0]), int(img_h * minmax[1]), size=count) + cut_w = np.random.randint(int(img_w * minmax[0]), int(img_w * minmax[1]), size=count) + yl = np.random.randint(0, img_h - cut_h, size=count) + xl = np.random.randint(0, img_w - cut_w, size=count) + yu = yl + cut_h + xu = xl + cut_w + return yl, yu, xl, xu + + +def cutmix_bbox_and_lam(img_shape, lam, ratio_minmax=None, correct_lam=True, count=None): + """ Generate bbox and apply lambda correction. + """ + if ratio_minmax is not None: + yl, yu, xl, xu = rand_bbox_minmax(img_shape, ratio_minmax, count=count) + else: + yl, yu, xl, xu = rand_bbox(img_shape, lam, count=count) + if correct_lam or ratio_minmax is not None: + bbox_area = (yu - yl) * (xu - xl) + lam = 1. - bbox_area / float(img_shape[-2] * img_shape[-1]) + return (yl, yu, xl, xu), lam + + +class Mixup: + """ Mixup/Cutmix that applies different params to each element or whole batch + + Args: + mixup_alpha (float): mixup alpha value, mixup is active if > 0. + cutmix_alpha (float): cutmix alpha value, cutmix is active if > 0. + cutmix_minmax (List[float]): cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None. + prob (float): probability of applying mixup or cutmix per batch or element + switch_prob (float): probability of switching to cutmix instead of mixup when both are active + mode (str): how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element) + correct_lam (bool): apply lambda correction when cutmix bbox clipped by image borders + label_smoothing (float): apply label smoothing to the mixed target tensor + num_classes (int): number of classes for target + """ + def __init__(self, mixup_alpha=1., cutmix_alpha=0., cutmix_minmax=None, prob=1.0, switch_prob=0.5, + mode='batch', correct_lam=True, label_smoothing=0.1, num_classes=1000): + self.mixup_alpha = mixup_alpha + self.cutmix_alpha = cutmix_alpha + self.cutmix_minmax = cutmix_minmax + if self.cutmix_minmax is not None: + assert len(self.cutmix_minmax) == 2 + # force cutmix alpha == 1.0 when minmax active to keep logic simple & safe + self.cutmix_alpha = 1.0 + self.mix_prob = prob + self.switch_prob = switch_prob + self.label_smoothing = label_smoothing + self.num_classes = num_classes + self.mode = mode + self.correct_lam = correct_lam # correct lambda based on clipped area for cutmix + self.mixup_enabled = True # set to false to disable mixing (intended tp be set by train loop) + + def _params_per_elem(self, batch_size): + lam = np.ones(batch_size, dtype=np.float32) + use_cutmix = np.zeros(batch_size, dtype=np.bool) + if self.mixup_enabled: + if self.mixup_alpha > 0. 
and self.cutmix_alpha > 0.: + use_cutmix = np.random.rand(batch_size) < self.switch_prob + lam_mix = np.where( + use_cutmix, + np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size), + np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size)) + elif self.mixup_alpha > 0.: + lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size) + elif self.cutmix_alpha > 0.: + use_cutmix = np.ones(batch_size, dtype=np.bool) + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size) + else: + assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true." + lam = np.where(np.random.rand(batch_size) < self.mix_prob, lam_mix.astype(np.float32), lam) + return lam, use_cutmix + + def _params_per_batch(self): + lam = 1. + use_cutmix = False + if self.mixup_enabled and np.random.rand() < self.mix_prob: + if self.mixup_alpha > 0. and self.cutmix_alpha > 0.: + use_cutmix = np.random.rand() < self.switch_prob + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) if use_cutmix else \ + np.random.beta(self.mixup_alpha, self.mixup_alpha) + elif self.mixup_alpha > 0.: + lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha) + elif self.cutmix_alpha > 0.: + use_cutmix = True + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) + else: + assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true." + lam = float(lam_mix) + return lam, use_cutmix + + def _mix_elem(self, x): + batch_size = len(x) + lam_batch, use_cutmix = self._params_per_elem(batch_size) + x_orig = x.clone() # need to keep an unmodified original for mixing source + for i in range(batch_size): + j = batch_size - i - 1 + lam = lam_batch[i] + if lam != 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + x[i] = x[i] * lam + x_orig[j] * (1 - lam) + return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1) + + def _mix_pair(self, x): + batch_size = len(x) + lam_batch, use_cutmix = self._params_per_elem(batch_size // 2) + x_orig = x.clone() # need to keep an unmodified original for mixing source + for i in range(batch_size // 2): + j = batch_size - i - 1 + lam = lam_batch[i] + if lam != 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] + x[j][:, yl:yh, xl:xh] = x_orig[i][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + x[i] = x[i] * lam + x_orig[j] * (1 - lam) + x[j] = x[j] * lam + x_orig[i] * (1 - lam) + lam_batch = np.concatenate((lam_batch, lam_batch[::-1])) + return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1) + + def _mix_batch(self, x): + lam, use_cutmix = self._params_per_batch() + if lam == 1.: + return 1. + if use_cutmix: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + x[:, :, yl:yh, xl:xh] = x.flip(0)[:, :, yl:yh, xl:xh] + else: + x_flipped = x.flip(0).mul_(1. 
- lam) + x.mul_(lam).add_(x_flipped) + return lam + + def __call__(self, x, target): + assert len(x) % 2 == 0, 'Batch size should be even when using this' + if self.mode == 'elem': + lam = self._mix_elem(x) + elif self.mode == 'pair': + lam = self._mix_pair(x) + else: + lam = self._mix_batch(x) + target = mixup_target(target, self.num_classes, lam, self.label_smoothing) + return x, target + + +class FastCollateMixup(Mixup): + """ Fast Collate w/ Mixup/Cutmix that applies different params to each element or whole batch + + A Mixup impl that's performed while collating the batches. + """ + + def _mix_elem_collate(self, output, batch, half=False): + batch_size = len(batch) + num_elem = batch_size // 2 if half else batch_size + assert len(output) == num_elem + lam_batch, use_cutmix = self._params_per_elem(num_elem) + for i in range(num_elem): + j = batch_size - i - 1 + lam = lam_batch[i] + mixed = batch[i][0] + if lam != 1.: + if use_cutmix[i]: + if not half: + mixed = mixed.copy() + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam) + np.rint(mixed, out=mixed) + output[i] += torch.from_numpy(mixed.astype(np.uint8)) + if half: + lam_batch = np.concatenate((lam_batch, np.ones(num_elem))) + return torch.tensor(lam_batch).unsqueeze(1) + + def _mix_pair_collate(self, output, batch): + batch_size = len(batch) + lam_batch, use_cutmix = self._params_per_elem(batch_size // 2) + for i in range(batch_size // 2): + j = batch_size - i - 1 + lam = lam_batch[i] + mixed_i = batch[i][0] + mixed_j = batch[j][0] + assert 0 <= lam <= 1.0 + if lam < 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + patch_i = mixed_i[:, yl:yh, xl:xh].copy() + mixed_i[:, yl:yh, xl:xh] = mixed_j[:, yl:yh, xl:xh] + mixed_j[:, yl:yh, xl:xh] = patch_i + lam_batch[i] = lam + else: + mixed_temp = mixed_i.astype(np.float32) * lam + mixed_j.astype(np.float32) * (1 - lam) + mixed_j = mixed_j.astype(np.float32) * lam + mixed_i.astype(np.float32) * (1 - lam) + mixed_i = mixed_temp + np.rint(mixed_j, out=mixed_j) + np.rint(mixed_i, out=mixed_i) + output[i] += torch.from_numpy(mixed_i.astype(np.uint8)) + output[j] += torch.from_numpy(mixed_j.astype(np.uint8)) + lam_batch = np.concatenate((lam_batch, lam_batch[::-1])) + return torch.tensor(lam_batch).unsqueeze(1) + + def _mix_batch_collate(self, output, batch): + batch_size = len(batch) + lam, use_cutmix = self._params_per_batch() + if use_cutmix: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + for i in range(batch_size): + j = batch_size - i - 1 + mixed = batch[i][0] + if lam != 1.: + if use_cutmix: + mixed = mixed.copy() # don't want to modify the original while iterating + mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh] + else: + mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam) + np.rint(mixed, out=mixed) + output[i] += torch.from_numpy(mixed.astype(np.uint8)) + return lam + + def __call__(self, batch, _=None): + batch_size = len(batch) + assert batch_size % 2 == 0, 'Batch size should be even when using this' + half = 'half' in self.mode + if half: + batch_size //= 2 + output = torch.zeros((batch_size, 
*batch[0][0].shape), dtype=torch.uint8) + if self.mode == 'elem' or self.mode == 'half': + lam = self._mix_elem_collate(output, batch, half=half) + elif self.mode == 'pair': + lam = self._mix_pair_collate(output, batch) + else: + lam = self._mix_batch_collate(output, batch) + target = torch.tensor([b[1] for b in batch], dtype=torch.int64) + target = mixup_target(target, self.num_classes, lam, self.label_smoothing, device='cpu') + target = target[:batch_size] + return output, target + diff --git a/object_detection/DETR/position_embedding.py b/object_detection/DETR/position_embedding.py new file mode 100644 index 00000000..045e2ed8 --- /dev/null +++ b/object_detection/DETR/position_embedding.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Positional embeddings, contains classes for sine-based and learning-based implementations. +""" + +import copy +import math +import paddle +import paddle.nn as nn + + +class PositionEmbeddingSine(nn.Layer): + def __init__(self, num_pos_feats=64, temp=10000, norm=False, scale=None): + super(PositionEmbeddingSine, self).__init__() + self.num_pos_feats = num_pos_feats + self.temp = temp + self.norm = norm + if scale is not None and norm is False: + raise ValueError('norm should be true is scale is passed') + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, tensor_list): + x = tensor_list.tensors + mask = tensor_list.mask + + #print('mask -----') + #for i in range(mask.shape[0]): + # for j in range(mask.shape[1]): + # for k in range(mask.shape[2]): + # print(int(mask[i, j, k].cpu().numpy()[0]), end=',') + # print() + # print('-----') + + not_mask = (mask < 0.5).astype('float32') + + y_embed = not_mask.cumsum(1, dtype='float32') + x_embed = not_mask.cumsum(2, dtype='float32') + + #print('-----y_embed') + #print(y_embed) + + if self.norm: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = paddle.arange(self.num_pos_feats, dtype='int32') # paddle elementwise_floordiv support int32 +#TODO: check bug + + dim_t = self.temp ** (2 * (dim_t // 2) / self.num_pos_feats) # int32 will cast to float32 + + pos_x = x_embed.unsqueeze(-1) / dim_t + pos_y = y_embed.unsqueeze(-1) / dim_t + + pos_x = paddle.stack((pos_x[:,:,:,0::2].sin(), pos_x[:,:,:,1::2].cos()), axis=4).flatten(3) + pos_y = paddle.stack((pos_y[:,:,:,0::2].sin(), pos_y[:,:,:,1::2].cos()), axis=4).flatten(3) + pos = paddle.concat((pos_y, pos_x), axis=3).transpose([0, 3, 1, 2]) + + #print('----- pos') + #print(pos) + + return pos + + +class PositionEmbeddingLearned(nn.Layer): + def __init__(self, num_pos_feats=256): + super(PositionEmbeddingLearned, self).__init__() + w_attr1 = self._init_weights() + w_attr2 = self._init_weights() + self.row_embed = nn.Embedding(50, num_pos_feats, weight_attr=w_attr1) #TODO: why 50? maximum? 
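+        # 50 follows the reference DETR implementation: the learned row/column tables assume backbone feature maps of at most 50x50, which covers the default input sizes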
+ self.col_embed = nn.Embedding(50, num_pos_feats, weight_attr=w_attr2) + + def _init_weights(self): + return paddle.ParamAttr(initializer=nn.initializer.Uniform(low=0., high=1.)) + + def forward(self, tensor_list): + x = tensor_list.tensors # [batch, 2048(R50 feat), H, W] + h, w = x.shape[-2:] + i = paddle.arange(w) + j = paddle.arange(h) + x_embed = self.col_embed(i) + y_embed = self.row_embed(j) + + #print('x_embed, ', x_embed.shape) + #print('y_embed, ', y_embed.shape) + + pos = paddle.concat([ + x_embed.unsqueeze(0).expand((h, x_embed.shape[0], x_embed.shape[1])), + y_embed.unsqueeze(1).expand((y_embed.shape[0], w, y_embed.shape[1])), + ], axis=-1) + #print(pos.shape) + pos = pos.transpose([2, 0, 1]) # [dim, h, w] + pos = pos.unsqueeze(0) # [1, dim, h, w] + pos = pos.expand([x.shape[0]] + pos.shape[1::]) # [batch_size, dim, h, w] + + return pos + + +def build_position_encoding(hidden_dim=256, mode='sine'): + N_steps = hidden_dim // 2 + if mode == 'sine': + position_embedding = PositionEmbeddingSine(N_steps, norm=True) + elif mode == 'learned': + position_embedding = PositionEmbeddingLearned(N_steps) + else: + raise ValueError(f'{mode} not supported') + return position_embedding diff --git a/object_detection/DETR/random_erasing.py b/object_detection/DETR/random_erasing.py new file mode 100644 index 00000000..a3f7d3b5 --- /dev/null +++ b/object_detection/DETR/random_erasing.py @@ -0,0 +1,108 @@ +import random +import math +import paddle + + +def _get_pixels(per_pixel, rand_color, patch_size, dtype="float32"): + if per_pixel: + return paddle.normal(shape=patch_size).astype(dtype) + elif rand_color: + return paddle.normal(shape=(patch_size[0], 1, 1)).astype(dtype) + else: + return paddle.zeros((patch_size[0], 1, 1)).astype(dtype) + +class RandomErasing(object): + """ + Args: + prob: probability of performing random erasing + min_area: Minimum percentage of erased area wrt input image area + max_area: Maximum percentage of erased area wrt input image area + min_aspect: Minimum aspect ratio of earsed area + max_aspect: Maximum aspect ratio of earsed area + mode: pixel color mode, in ['const', 'rand', 'pixel'] + 'const' - erase block is constant valued 0 for all channels + 'rand' - erase block is valued random color (same per-channel) + 'pixel' - erase block is vauled random color per pixel + min_count: Minimum # of ereasing blocks per image. + max_count: Maximum # of ereasing blocks per image. 
Area per box is scaled by count + per-image count is randomly chosen between min_count to max_count + """ + def __init__(self, prob=0.5, min_area=0.02, max_area=1/3, min_aspect=0.3, max_aspect=None, + mode='const', min_count=1, max_count=None, num_splits=0): + self.prob = prob + self.min_area = min_area + self.max_area = max_area + max_aspect = max_aspect or 1 / min_aspect + self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + self.min_count = min_count + self.max_count = max_count or min_count + self.num_splits = num_splits + mode = mode.lower() + self.rand_color = False + self.per_pixel = False + if mode == "rand": + self.rand_color = True + elif mode == "pixel": + self.per_pixel = True + else: + assert not mode or mode == "const" + + def _erase(self, img, chan, img_h, img_w, dtype): + if random.random() > self.prob: + return + area = img_h * img_w + count = self.min_count if self.min_count == self.max_count else \ + random.randint(self.min_count, self.max_count) + for _ in range(count): + for attempt in range(10): + target_area = random.uniform(self.min_area, self.max_area) * area / count + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + #print(h, w) + if w < img_w and h < img_h: + top = random.randint(0, img_h - h) + left = random.randint(0, img_w - w) + #print(top, left) + + img[:, top:top+h, left:left+w] = _get_pixels( + self.per_pixel, self.rand_color, (chan, h, w), + dtype=dtype) + #print(_get_pixels( + # self.per_pixel, self.rand_color, (chan, h, w), + # dtype=dtype)) + break + + def __call__(self, input): + if len(input.shape) == 3: + self._erase(input, *input.shape, input.dtype) + else: + batch_size, chan, img_h, img_w = input.shape + batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 + for i in range(batch_start, batch_size): + self._erase(input[i], chan, img_h, img_w, input.dtype) + return input + + + +def main(): + re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='rand') + #re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='const') + #re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='pixel') + import PIL.Image as Image + import numpy as np + paddle.set_device('cpu') + img = paddle.to_tensor(np.asarray(Image.open('./lenna.png'))).astype('float32') + img = img / 255.0 + img = paddle.transpose(img, [2, 0, 1]) + new_img = re(img) + new_img = new_img * 255.0 + new_img = paddle.transpose(new_img, [1, 2, 0]) + new_img = new_img.cpu().numpy() + new_img = Image.fromarray(new_img.astype('uint8')) + new_img.save('./res.png') + + + +if __name__ == "__main__": + main() diff --git a/object_detection/DETR/resnet.py b/object_detection/DETR/resnet.py new file mode 100644 index 00000000..056dd46e --- /dev/null +++ b/object_detection/DETR/resnet.py @@ -0,0 +1,265 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
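+# Usage sketch (example values only, arguments as defined by the ResNet class below):
+#   backbone = resnet50(pretrained=True, num_classes=0, with_pool=False,
+#                       replace_stride_with_dilation=[False, False, True])
+#   feat = backbone(paddle.randn([1, 3, 800, 800]))  # -> [1, 2048, 50, 50] feature map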
+ +""" ResNet, add new features that allows changing dilation, and change norm layers, +Mostly refered: https://github.com/PaddlePaddle/Paddle/blob/release/2.1/python/paddle/vision/models/resnet.py +""" + +import paddle +import paddle.nn as nn + +from paddle.utils.download import get_weights_path_from_url + +__all__ = [ + 'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152' +] + +model_urls = { + 'resnet18': ('https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams', + 'cf548f46534aa3560945be4b95cd11c4'), + 'resnet34': ('https://paddle-hapi.bj.bcebos.com/models/resnet34.pdparams', + '8d2275cf8706028345f78ac0e1d31969'), + 'resnet50': ('https://paddle-hapi.bj.bcebos.com/models/resnet50.pdparams', + 'ca6f485ee1ab0492d38f323885b0ad80'), + 'resnet101': ('https://paddle-hapi.bj.bcebos.com/models/resnet101.pdparams', + '02f35f034ca3858e1e54d4036443c92d'), + 'resnet152': ('https://paddle-hapi.bj.bcebos.com/models/resnet152.pdparams', + '7ad16a2f1e7333859ff986138630fd7a'), +} + + + +class BasicBlock(nn.Layer): + expansion = 1 + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None): + super(BasicBlock, self).__init__() + if dilation > 1: + raise ValueError('Basic block does not support dilation') + if norm_layer is None: + norm_layer = nn.BatchNorm2D + self.conv1 = nn.Conv2D( + inplanes, planes, 3, padding=1, stride=stride, bias_attr=False) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2D( + planes, planes, 3, padding=1, bias_attr=False) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + out = self.relu(out) + return out + + +class BottleneckBlock(nn.Layer): + expansion = 4 + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None): + super(BottleneckBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2D + width = int(planes * (base_width / 64.)) * groups + self.conv1 = nn.Conv2D(inplanes, width, 1, bias_attr=False) + self.bn1 = norm_layer(width) + + self.conv2 = nn.Conv2D(width, + width, + 3, + padding=dilation, + stride=stride, + groups=groups, + dilation=dilation, + bias_attr=False) + self.bn2 = norm_layer(width) + self.conv3 = nn.Conv2D(width, planes * self.expansion, 1, bias_attr=False) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class ResNet(nn.Layer): + def __init__(self, + block, + depth, + num_classes=1000, + with_pool=True, + norm_layer=None, + replace_stride_with_dilation=None, + dilation=1): + super(ResNet, self).__init__() + layer_cfg = { + 18: [2, 2, 2, 2], + 34: [3, 4, 6, 3], + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + 152: [3, 8, 36, 3] + } + + if replace_stride_with_dilation is None: + replace_stride_with_dilation = [False, False, False] + if 
len(replace_stride_with_dilation) != 3: + raise ValueError('replace_stride_with_dilation shoule be None or 3-element tuple') + + layers = layer_cfg[depth] + self.num_classes = num_classes + self.with_pool = with_pool + + if norm_layer is None: + norm_layer = nn.BatchNorm2D + self._norm_layer = norm_layer + + + self.inplanes = 64 + self.dilation = dilation + + self.conv1 = nn.Conv2D( + 3, + self.inplanes, + kernel_size=7, + stride=2, + padding=3, + bias_attr=False) + self.bn1 = self._norm_layer(self.inplanes) + self.relu = nn.ReLU() + self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2, + dilate=replace_stride_with_dilation[2]) + if with_pool: + self.avgpool = nn.AdaptiveAvgPool2D((1,1)) + if num_classes > 0: + self.fc = nn.Linear(512 * block.expansion, num_classes) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride =1 + if stride !=1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D(self.inplanes, + planes*block.expansion, + 1, + stride=stride, + bias_attr=False), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append( + block(self.inplanes, planes, stride, downsample, 1, 64, + previous_dilation, norm_layer)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, norm_layer=norm_layer)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + if self.with_pool: + x = self.avgpool(x) + if self.num_classes > 0: + x = paddle.flatten(x, 1) + x = self.fc(x) + + return x + +def _resnet(arch, Block, depth, pretrained, **kwargs): + model = ResNet(Block, depth, **kwargs) + if pretrained: + assert arch in model_urls, f"{arch} model do not have a pretrained model now" + weight_path = get_weights_path_from_url(model_urls[arch][0], + model_urls[arch][1]) + param = paddle.load(weight_path) + model.set_dict(param) + return model + + +def resnet18(pretrained=False, **kwargs): + return _resnet('resnet18', BasicBlock, 18, pretrained, **kwargs) + +def resnet34(pretrained=False, **kwargs): + return _resnet('resnet34', BasicBlock, 34, pretrained, **kwargs) + +def resnet50(pretrained=False, **kwargs): + return _resnet('resnet50', BottleneckBlock, 50, pretrained, **kwargs) + +def resnet101(pretrained=False, **kwargs): + return _resnet('resnet101', BottleneckBlock, 101, pretrained, **kwargs) + +def resnet152(pretrained=False, **kwargs): + return _resnet('resnet152', BottleneckBlock, 152, pretrained, **kwargs) diff --git a/object_detection/DETR/run_eval.sh b/object_detection/DETR/run_eval.sh new file mode 100644 index 00000000..a9e36aa5 --- /dev/null +++ b/object_detection/DETR/run_eval.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=7 \ +python main_single_gpu.py \ +-cfg='./configs/detr_resnet50.yaml' \ +-dataset='coco' \ +-batch_size=2 \ +-data_path='/dataset/coco' \ +-eval \ +-pretrained='./detr_resnet50' diff --git 
a/object_detection/DETR/run_eval_multi.sh b/object_detection/DETR/run_eval_multi.sh
new file mode 100644
index 00000000..52b84627
--- /dev/null
+++ b/object_detection/DETR/run_eval_multi.sh
@@ -0,0 +1,8 @@
+CUDA_VISIBLE_DEVICES=4,5,6,7 \
+python main_multi_gpu.py \
+-cfg='./configs/detr_resnet50.yaml' \
+-dataset='coco' \
+-batch_size=2 \
+-data_path='/dataset/coco' \
+-eval \
+-pretrained='./detr_resnet50'
diff --git a/object_detection/DETR/run_eval_r101.sh b/object_detection/DETR/run_eval_r101.sh
new file mode 100644
index 00000000..2d729434
--- /dev/null
+++ b/object_detection/DETR/run_eval_r101.sh
@@ -0,0 +1,8 @@
+CUDA_VISIBLE_DEVICES=7 \
+python main_single_gpu.py \
+-cfg='./configs/detr_resnet101.yaml' \
+-dataset='coco' \
+-batch_size=2 \
+-data_path='/dataset/coco' \
+-eval \
+-pretrained='./detr_resnet101'
diff --git a/object_detection/DETR/run_eval_r101_multi.sh b/object_detection/DETR/run_eval_r101_multi.sh
new file mode 100644
index 00000000..3b56b196
--- /dev/null
+++ b/object_detection/DETR/run_eval_r101_multi.sh
@@ -0,0 +1,8 @@
+CUDA_VISIBLE_DEVICES=4,5,6,7 \
+python main_multi_gpu.py \
+-cfg='./configs/detr_resnet101.yaml' \
+-dataset='coco' \
+-batch_size=2 \
+-data_path='/dataset/coco' \
+-eval \
+-pretrained='./detr_resnet101'
diff --git a/object_detection/DETR/segmentation.py b/object_detection/DETR/segmentation.py
new file mode 100644
index 00000000..77010757
--- /dev/null
+++ b/object_detection/DETR/segmentation.py
@@ -0,0 +1,24 @@
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+# TODO: add related classes and methods for segmentation
+def dice_loss(inputs, targets, num_boxes):
+    inputs = F.sigmoid(inputs)
+    inputs = inputs.flatten(1)
+    numerator = 2 * (inputs * targets).sum(1)
+    denominator = inputs.sum(-1) + targets.sum(-1)
+    loss = 1 - (numerator + 1) / (denominator + 1)
+    return loss.sum() / num_boxes
+
+def sigmoid_focal_loss(inputs, targets, num_boxes, alpha=.25, gamma=2.):
+    prob = F.sigmoid(inputs)
+    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+    p_t = prob * targets + (1 - prob) * (1 - targets)
+    loss = ce_loss * ((1 - p_t) ** gamma)
+
+    if alpha >= 0:
+        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
+        loss = alpha_t * loss
+
+    return loss.mean(1).sum() / num_boxes
diff --git a/object_detection/DETR/tests/__init__.py b/object_detection/DETR/tests/__init__.py
new file mode 100644
index 00000000..00bcb6e3
--- /dev/null
+++ b/object_detection/DETR/tests/__init__.py
@@ -0,0 +1 @@
+# test
\ No newline at end of file
diff --git a/object_detection/DETR/tests/box.png b/object_detection/DETR/tests/box.png
new file mode 100644
index 00000000..0201de49
Binary files /dev/null and b/object_detection/DETR/tests/box.png differ
diff --git a/object_detection/DETR/tests/draw_box.py b/object_detection/DETR/tests/draw_box.py
new file mode 100644
index 00000000..1e145d38
--- /dev/null
+++ b/object_detection/DETR/tests/draw_box.py
@@ -0,0 +1,31 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from PIL import Image, ImageDraw
+
+im = np.ones([600, 800, 3])* 255.0
+im = im.astype(np.uint8)
+im = Image.fromarray(im)
+
+draw = ImageDraw.Draw(im)
+
+boxes = [[100, 100, 300, 400],
+         [120, 160, 230, 340],
+         [200, 320, 500, 580],
+         [400, 450, 700, 550],
+         [450, 80, 580, 210]]
+color = ['red','blue','magenta','green', 'gold']
+xy = [[180, 120],[150, 160],[350, 400],[600, 500],[500, 110]]
+
+
+for idx, box in enumerate(boxes):
+    draw.rectangle(box, fill=None,
outline=color[idx], width=4) + draw.text((box[0],box[1]), f'{box[0]},{box[1]}', fill=color[idx]) + draw.text((box[2],box[3]), f'{box[2]},{box[3]}', fill=color[idx]) + + draw.text((xy[idx][0], xy[idx][1]), f'{idx+1}', fill=color[idx]) + +im.save('box.png') + + + diff --git a/object_detection/DETR/tests/test_box_ops.py b/object_detection/DETR/tests/test_box_ops.py new file mode 100644 index 00000000..c0350346 --- /dev/null +++ b/object_detection/DETR/tests/test_box_ops.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.: + + + +import unittest +import paddle +import numpy as np +from box_ops import * +from utils import NestedTensor + + +class BoxTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + paddle.set_device('cpu') + + @classmethod + def tearDown(cls): + pass + + @classmethod + def tearDown(cls): + pass + + #@unittest.skip('skip fo debug') + def test_box_cxcyhw_to_xyxy(self): + box = [120, 60, 40, 50] + box = paddle.to_tensor(box) + + new_box = box_cxcywh_to_xyxy(box) + new_box = new_box.numpy().tolist() + self.assertEqual(new_box, [100, 35, 140, 85]) + + #@unittest.skip('skip fo debug') + def test_box_xyxy_to_cxcyhw(self): + box = [100, 35, 140, 85] + box = paddle.to_tensor(box) + + new_box = box_xyxy_to_cxcywh(box) + new_box = new_box.numpy().tolist() + self.assertEqual(new_box, [120, 60, 40, 50]) + + #@unittest.skip('skip fo debug') + def test_box_area(self): + box = [[100, 35, 140, 85], [10, 30, 20, 100]] + box = paddle.to_tensor(box) + area = box_area(box) + self.assertEqual(area[0], 2000) + self.assertEqual(area[1], 700) + + #@unittest.skip('skip fo debug') + def test_box_iou(self): + boxes = [[100, 100, 300, 400], + [120, 160, 230, 340], + [200, 320, 500, 580], + [400, 450, 700, 550], + [450, 80, 580, 210]] + boxes = paddle.to_tensor(boxes).astype('float32') + iou, union = box_iou(boxes, boxes) + #print(iou) + #print(union) + + self.assertEqual(union[0][0], 60000) # area of box1 + self.assertEqual(union[1][1], 19800) # area of box2 + self.assertEqual(union[2][2], 78000) # area of box3 + self.assertEqual(union[3][3], 30000) # area of box4 + self.assertEqual(union[4][4], 16900) # area of box5 + + self.assertEqual(union[0][1], 60000) # box2 in box1: res=area of box1 + self.assertEqual(union[0][2], 130000) # area of box1 + box3 - overlap(80*100) + self.assertEqual(union[0][3], 90000) # no overlap, area box1 + box4 + self.assertEqual(union[0][4], 76900) # no overlap, area box1 + box5 + + + self.assertAlmostEqual(iou[0][1], 0.33, 4) + self.assertAlmostEqual(iou[0][2], 8000/130000, 4) + self.assertAlmostEqual(iou[0][3], 0, 4) + self.assertAlmostEqual(iou[0][4], 0, 4) + + + #@unittest.skip('skip fo debug') + def test_generalized_box_iou(self): + boxes = [[100, 100, 300, 400], + [120, 160, 230, 340], + [200, 320, 500, 580], + [400, 450, 700, 550], + [450, 80, 580, 210]] + boxes = paddle.to_tensor(boxes).astype('float32') + giou = generalized_box_iou(boxes, boxes) + #print(giou) + + 
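+        # GIoU = IoU minus the fraction of the smallest enclosing box not covered by
+        # the union, so it is 1 for identical boxes and negative for disjoint ones.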
self.assertAlmostEqual(giou[0][0], 1, 4) + self.assertAlmostEqual(giou[0][1], 0.33, 4) + self.assertAlmostEqual(giou[0][2].numpy()[0], -0.2613, 3) + self.assertAlmostEqual(giou[0][3].numpy()[0], -0.6666, 3) + self.assertAlmostEqual(giou[0][4].numpy()[0], -0.4993, 3) + + #@unittest.skip('skip fo debug') + def test_masks_to_boxes(self): + masks = paddle.ones([1, 50, 50]) + masks[:, 0:20, :] = 0 + masks[:, 45::, :] = 0 + masks[:, :, 0:10] = 0 + masks[:, :, 49::] = 0 + + boxes = masks_to_boxes(masks) + self.assertEqual(boxes[0].numpy()[0], 10) + self.assertEqual(boxes[0].numpy()[1], 20) + self.assertEqual(boxes[0].numpy()[2], 48) + self.assertEqual(boxes[0].numpy()[3], 44) diff --git a/object_detection/DETR/tests/test_coco.py b/object_detection/DETR/tests/test_coco.py new file mode 100644 index 00000000..bddeddbd --- /dev/null +++ b/object_detection/DETR/tests/test_coco.py @@ -0,0 +1,148 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.: + +import unittest +import os +import paddle +import numpy as np +import PIL +from PIL import Image, ImageDraw, ImageFont +from coco import build_coco +from coco import make_coco_transforms +from coco import CocoDetection +from box_ops import box_cxcywh_to_xyxy +from pycocotools.coco import COCO + +class CocoTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + #cls.data_path = '/dataset/coco/' + #cls.im_mean = np.array([0.485, 0.456, 0.406]) + #cls.im_std = np.array([0.229, 0.224, 0.225]) + #cls.coco_dataset_train_det = build_coco('train', CocoTest.data_path, False) + #cls.coco_dataset_val_det = build_coco('val', CocoTest.data_path, False) + #cls.coco_dataset_train_det_mask = build_coco('train', CocoTest.data_path, True) + #cls.coco_dataset_val_det_mask = build_coco('val', CocoTest.data_path, True) + #cls.cat_train = cls.coco_dataset_train_det.coco.dataset['categories'] + #cls.cat_val = cls.coco_dataset_val_det.coco.dataset['categories'] + #cls.fnt = ImageFont.truetype("./FreeMono.ttf", 20) + #cls.out = './tmp_out' + #if not os.path.exists(cls.out): + # os.mkdir(cls.out) + #cls.colors = ['blue','orange','green','red','purple','brown','pink','gray','olive','cyan'] + pass + + @classmethod + def tearDown(cls): + pass + + @unittest.skip('skip for debug') + def test_build_coco_train_det_cpu(self): + paddle.set_device('cpu') + self._test_build_coco_det(CocoTest.coco_dataset_train_det, 'train') + + @unittest.skip('skip for debug') + def test_build_coco_train_det_gpu(self): + paddle.set_device('gpu') + self._test_build_coco_det(CocoTest.coco_dataset_train_det, 'train') + + @unittest.skip('skip for debug') + def test_build_coco_train_det_mask_cpu(self): + paddle.set_device('cpu') + self._test_build_coco_det_mask(CocoTest.coco_dataset_train_det_mask, 'train') + + @unittest.skip('skip for debug') + def test_build_coco_train_det_mask_gpu(self): + paddle.set_device('gpu') + self._test_build_coco_det_mask(CocoTest.coco_dataset_train_det_mask, 'train') + + @unittest.skip('skip for debug') + def 
test_build_coco_val_det_cpu(self): + paddle.set_device('cpu') + self._test_build_coco_det(CocoTest.coco_dataset_val_det, 'val') + + @unittest.skip('skip for debug') + def test_build_coco_val_det_gpu(self): + paddle.set_device('gpu') + self._test_build_coco_det(CocoTest.coco_dataset_val_det, 'val') + + @unittest.skip('skip for debug') + def test_build_coco_val_det_mask_cpu(self): + paddle.set_device('cpu') + self._test_build_coco_det_mask(CocoTest.coco_dataset_val_det_mask, 'val') + + @unittest.skip('skip for debug') + def test_build_coco_val_det_mask_gpu(self): + paddle.set_device('gpu') + self._test_build_coco_det_mask(CocoTest.coco_dataset_val_det_mask, 'val') + + def _test_build_coco_det_mask(self, coco_dataset, mode): + for idx, (image, target) in enumerate(coco_dataset): + if 'masks' in target: + masks = target['masks'].cpu().numpy() # [N, H, W] + if np.any(masks): + print('saving masks into png') + for i in range(masks.shape[0]): + mask = masks[i, :, :] * 255.0 + mask = mask.astype('uint8') + im = Image.fromarray(mask) + im.save(os.path.join(CocoTest.out, f'mask_{mode}_{idx}_{i}_{paddle.get_device()}.png')) + + # save image + image = image.transpose([1, 2, 0]) # [C, H, W] + image = image.cpu().numpy() + image = (image * CocoTest.im_std + CocoTest.im_mean) * 255.0 + image = image.astype('uint8') + im = Image.fromarray(image) + im.save(os.path.join(CocoTest.out, f'img_mask_{mode}_{idx}_from_{paddle.get_device()}.png')) + break + else: + print('no masks in curren image, continue') + continue + else: + print('no masks in curren image, continue') + continue + + def _test_build_coco_det(self, coco_dataset, mode): + def get_cat_name(id, cat): + for item in cat: + if item['id'] == id: + return item['name'] + return "" + # used to recover image + for idx, (image, target) in enumerate(coco_dataset): + # recover and save image to file, for manual check + image = image.transpose([1, 2, 0]) # [C, H, W] + image = image.cpu().numpy() + image = (image * CocoTest.im_std + CocoTest.im_mean) * 255.0 + image = image.astype('uint8') + im = Image.fromarray(image) + # get bbox labels + labels = target['labels'].cpu().numpy() + # draw bbox on image + h, w = image.shape[0], image.shape[1] + boxes = target['boxes'] + boxes = boxes * paddle.to_tensor([w, h, w, h]) + boxes = box_cxcywh_to_xyxy(boxes) + boxes = boxes.cpu().numpy() # [N, 4] + im1 = ImageDraw.Draw(im) + for i in range(boxes.shape[0]): + box = boxes[i].astype('int32') + box = [(box[0], box[1]), (box[2], box[3])] + im1.rectangle(box, outline=CocoTest.colors[i % len(CocoTest.colors)], width=5) + im1.text(box[0], get_cat_name(labels[i], CocoTest.cat_val), font=CocoTest.fnt, fill='red') + im.save(os.path.join(CocoTest.out, f'img_{mode}_{idx}_from_{paddle.get_device()}.png')) + if idx >= 5: + break + diff --git a/object_detection/DETR/tests/test_detr.py b/object_detection/DETR/tests/test_detr.py new file mode 100644 index 00000000..e2d9559d --- /dev/null +++ b/object_detection/DETR/tests/test_detr.py @@ -0,0 +1,238 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License.: + +import unittest +import os +import paddle +import numpy as np +import PIL +from PIL import Image, ImageDraw, ImageFont +from coco import build_coco +from coco import make_coco_transforms +from coco import CocoDetection +from box_ops import box_cxcywh_to_xyxy +from pycocotools.coco import COCO +from utils import collate_fn +from utils import NestedTensor +from detr import build_detr + + +class DetrTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + #cls.data_path = '/dataset/coco/' + #cls.coco_dataset_train = build_coco('train', cls.data_path, False) + + #cls.im_mean = np.array([0.485, 0.456, 0.406]) + #cls.im_std = np.array([0.229, 0.224, 0.225]) + #cls.coco_dataset_val_det = build_coco('val', CocoTest.data_path, False) + #cls.coco_dataset_train_det_mask = build_coco('train', CocoTest.data_path, True) + #cls.coco_dataset_val_det_mask = build_coco('val', CocoTest.data_path, True) + #cls.cat_train = cls.coco_dataset_train_det.coco.dataset['categories'] + #cls.cat_val = cls.coco_dataset_val_det.coco.dataset['categories'] + #cls.fnt = ImageFont.truetype("./FreeMono.ttf", 20) + #cls.out = './tmp_out' + #if not os.path.exists(cls.out): + # os.mkdir(cls.out) + #cls.colors = ['blue','orange','green','red','purple','brown','pink','gray','olive','cyan'] + + with open('./t.npy', 'rb') as infile: + t = np.load(infile, allow_pickle=True) + m = np.load(infile, allow_pickle=True) + tar = np.load(infile, allow_pickle=True) + #tt = torch.Tensor(t) + #mm = torch.Tensor(m) + #th_in = th_utils.NestedTensor(tt, mm) + + # targets + targets = [] + for ta in tar: + target = dict() + for key, val in ta.items(): + target[key] = paddle.to_tensor(val) + targets.append(target) + targets = tuple(targets) + + + ttt = paddle.to_tensor(t) + mmm = paddle.to_tensor(m) + pp_in = NestedTensor(ttt, mmm) + + #print(th_in, th_in.tensors.shape) + #print(pp_in, pp_in.tensors.shape) + #print(targets) + + #cls.th_in = th_in + cls.pp_in = pp_in + cls.pp_gt = targets + + + @classmethod + def tearDown(cls): + pass + + @unittest.skip('skip for debug') + def test_build_detr_cpu(self): + paddle.set_device('cpu') + model, criterion, postprocessors = build_detr() + model.eval() + + #sampler_train = paddle.io.BatchSampler(CocoTest.coco_dataset_train, + # batch_size=4, + # shuffle=False) + #dataloader_train = paddle.io.DataLoader(CocoTest.coco_dataset_train, + # batch_sampler=sampler_train, + # collate_fn=collate_fn) + + #for idx, batch_data in enumerate(dataloader_train): + # samples, targets = batch_data[0], batch_data[1] + + # print('=================================') + # print(samples) + # print(targets) + # paddle.save({'tensors':samples.tensors, 'mask': samples.mask}, path='./batch_samples_01.pdtensor', protocol=2) + # paddle.save(targets, path='./batch_targets_01.pdtensor', protocol=2) + # print('=================================') + # break + + ### print('----- mask shape = ') + ### print(samples.mask.shape) + ### print('----- samples shape=') + ### print(samples.tensors.shape) + ### out = model(samples) + ### print(out) + + #samples = paddle.load(path='./batch_samples_01.pdtensor') + #samples = NestedTensor(samples['tensors'], samples['mask']) + #print(samples.tensors.shape) + #targets = paddle.load(path='./batch_targets_01.pdtensor') + + out = model(DetrTest.pp_in) + #print(out) + #print(out.keys()) # pred_logits[2, 100, 92], pred_boxes: [2, 100, 4], aux_outputs + + # test criterion + losses = criterion(out, 
DetrTest.pp_gt) + #print(losses) + + return + + ## test postprocess + target_sizes = [ + [768, 1027], + [768, 1027], + [768, 1027], + [768, 1027], + ] + target_sizes = paddle.to_tensor(target_sizes) + res = postprocessors(outputs, target_size=target_size) + print(res) + + + + + #@unittest.skip('skip for debug') + #def test_build_coco_train_det_gpu(self): + # paddle.set_device('gpu') + # self._test_build_coco_det(CocoTest.coco_dataset_train_det, 'train') + + #@unittest.skip('skip for debug') + #def test_build_coco_train_det_mask_cpu(self): + # paddle.set_device('cpu') + # self._test_build_coco_det_mask(CocoTest.coco_dataset_train_det_mask, 'train') + + #@unittest.skip('skip for debug') + #def test_build_coco_train_det_mask_gpu(self): + # paddle.set_device('gpu') + # self._test_build_coco_det_mask(CocoTest.coco_dataset_train_det_mask, 'train') + + #@unittest.skip('skip for debug') + #def test_build_coco_val_det_cpu(self): + # paddle.set_device('cpu') + # self._test_build_coco_det(CocoTest.coco_dataset_val_det, 'val') + + #@unittest.skip('skip for debug') + #def test_build_coco_val_det_gpu(self): + # paddle.set_device('gpu') + # self._test_build_coco_det(CocoTest.coco_dataset_val_det, 'val') + + #@unittest.skip('skip for debug') + #def test_build_coco_val_det_mask_cpu(self): + # paddle.set_device('cpu') + # self._test_build_coco_det_mask(CocoTest.coco_dataset_val_det_mask, 'val') + + #@unittest.skip('skip for debug') + #def test_build_coco_val_det_mask_gpu(self): + # paddle.set_device('gpu') + # self._test_build_coco_det_mask(CocoTest.coco_dataset_val_det_mask, 'val') + + #def _test_build_coco_det_mask(self, coco_dataset, mode): + # for idx, (image, target) in enumerate(coco_dataset): + # if 'masks' in target: + # masks = target['masks'].cpu().numpy() # [N, H, W] + # if np.any(masks): + # print('saving masks into png') + # for i in range(masks.shape[0]): + # mask = masks[i, :, :] * 255.0 + # mask = mask.astype('uint8') + # im = Image.fromarray(mask) + # im.save(os.path.join(CocoTest.out, f'mask_{mode}_{idx}_{i}_{paddle.get_device()}.png')) + + # # save image + # image = image.transpose([1, 2, 0]) # [C, H, W] + # image = image.cpu().numpy() + # image = (image * CocoTest.im_std + CocoTest.im_mean) * 255.0 + # image = image.astype('uint8') + # im = Image.fromarray(image) + # im.save(os.path.join(CocoTest.out, f'img_mask_{mode}_{idx}_from_{paddle.get_device()}.png')) + # break + # else: + # print('no masks in curren image, continue') + # continue + # else: + # print('no masks in curren image, continue') + # continue + + #def _test_build_coco_det(self, coco_dataset, mode): + # def get_cat_name(id, cat): + # for item in cat: + # if item['id'] == id: + # return item['name'] + # return "" + # # used to recover image + # for idx, (image, target) in enumerate(coco_dataset): + # # recover and save image to file, for manual check + # image = image.transpose([1, 2, 0]) # [C, H, W] + # image = image.cpu().numpy() + # image = (image * CocoTest.im_std + CocoTest.im_mean) * 255.0 + # image = image.astype('uint8') + # im = Image.fromarray(image) + # # get bbox labels + # labels = target['labels'].cpu().numpy() + # # draw bbox on image + # h, w = image.shape[0], image.shape[1] + # boxes = target['boxes'] + # boxes = boxes * paddle.to_tensor([w, h, w, h]) + # boxes = box_cxcywh_to_xyxy(boxes) + # boxes = boxes.cpu().numpy() # [N, 4] + # im1 = ImageDraw.Draw(im) + # for i in range(boxes.shape[0]): + # box = boxes[i].astype('int32') + # box = [(box[0], box[1]), (box[2], box[3])] + # im1.rectangle(box, 
outline=CocoTest.colors[i % len(CocoTest.colors)], width=5) + # im1.text(box[0], get_cat_name(labels[i], CocoTest.cat_val), font=CocoTest.fnt, fill='red') + # im.save(os.path.join(CocoTest.out, f'img_{mode}_{idx}_from_{paddle.get_device()}.png')) + # if idx >= 5: + # break + diff --git a/object_detection/DETR/tests/test_resnet.py b/object_detection/DETR/tests/test_resnet.py new file mode 100644 index 00000000..f41b2174 --- /dev/null +++ b/object_detection/DETR/tests/test_resnet.py @@ -0,0 +1,110 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.: + +import unittest +import paddle +import numpy as np +from resnet import resnet50 as myresnet50 +from paddle.vision.models import resnet50 +from resnet import resnet18 as myresnet18 +from paddle.vision.models import resnet18 + +from backbone import FrozenBatchNorm2D +from backbone import IntermediateLayerGetter + + +class ResnetTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + paddle.set_device('cpu') + cls.dummy_img = np.random.randn(1, 3, 224, 224).astype('float32') + cls.dummy_tensor = paddle.to_tensor(cls.dummy_img) + + @classmethod + def tearDown(cls): + pass + + + @unittest.skip('skip for debug') + def test_resnet50(self): + model = resnet50(pretrained=True) + mymodel = myresnet50(pretrained=True) + + out = model(ResnetTest.dummy_tensor) + myout = mymodel(ResnetTest.dummy_tensor) + + out = out.cpu().numpy() + myout = myout.cpu().numpy() + + self.assertTrue(np.allclose(out, myout)) + + @unittest.skip('skip for debug') + def test_resnet18(self): + model = resnet18(pretrained=True) + mymodel = myresnet18(pretrained=True) + + out = model(ResnetTest.dummy_tensor) + myout = mymodel(ResnetTest.dummy_tensor) + + out = out.cpu().numpy() + myout = myout.cpu().numpy() + + + self.assertTrue(np.allclose(out, myout)) + + + + + @unittest.skip('skip for debug') + def test_frozen_bn(self): + model = resnet18(pretrained=True) + bn1 = model.bn1 + bn1_st = bn1.state_dict() + bn1.eval() + + frozen_bn = FrozenBatchNorm2D(64) + frozen_bn.set_state_dict(bn1_st) + + tmp = paddle.randn([4, 64, 5, 5]) + out = bn1(tmp) + out_f = frozen_bn(tmp) + + self.assertTrue([4, 64, 5, 5], out_f.shape) + + out = out.cpu().numpy() + out_f = out_f.cpu().numpy() + + self.assertTrue(np.allclose(out, out_f, atol=1e-5)) + + + + + @unittest.skip('skip for debug') + def test_intermediate_layer_getter(self): + model = resnet50(pretrained=True) + return_layers = {'layer1': '0', 'layer2': '1', 'layer3': '2', 'layer4': '3'} + new_model = IntermediateLayerGetter(model, return_layers) + tmp = paddle.randn([1, 3, 224, 224]) + out = new_model(tmp) + #print([(k, v.shape) for k,v in out.items()]) + + self.assertEqual(out['0'].shape, [1, 256, 56, 56]) + self.assertEqual(out['1'].shape, [1, 512, 28, 28]) + self.assertEqual(out['2'].shape, [1, 1024, 14, 14]) + self.assertEqual(out['3'].shape, [1, 2048, 7, 7]) + + + + + diff --git a/object_detection/DETR/tests/test_transformer.py b/object_detection/DETR/tests/test_transformer.py new 
file mode 100644 index 00000000..71368525 --- /dev/null +++ b/object_detection/DETR/tests/test_transformer.py @@ -0,0 +1,73 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.: + + + +import unittest +import paddle +import numpy as np +from transformer import Transformer +from position_embedding import build_position_encoding +from utils import NestedTensor + + +class TransformerTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + paddle.set_device('cpu') + cls.tensors = paddle.randn((4, 256, 24, 33)) + cls.masks = paddle.ones((4, 24, 33)) + cls.query_embed = paddle.randn((100, 256)) + cls.pos_embed = paddle.randn((4, 256, 24, 33)) + + @classmethod + def tearDown(cls): + pass + + @classmethod + def tearDown(cls): + pass + + + + @unittest.skip('skip fo debug') + def test_position_embed(self): + t = TransformerTest.tensors + m = TransformerTest.masks + tensor_list = NestedTensor(t, m) + + pos_embed = build_position_encoding() + out = pos_embed(tensor_list) + self.assertEqual(out.shape, [4, 256, 24, 33]) + + + @unittest.skip('skip fo debug') + def test_transformer(self): + t = TransformerTest.tensors + m = TransformerTest.masks + q = TransformerTest.query_embed + p = TransformerTest.pos_embed + + model = Transformer() + out = model(src=t, + mask=m, + query_embed=q, + pos_embed=p) + + @unittest.skip('skip fo debug') + def test_position_embed_sine(self): + pass + + + diff --git a/object_detection/DETR/transformer.py b/object_detection/DETR/transformer.py new file mode 100644 index 00000000..686670b5 --- /dev/null +++ b/object_detection/DETR/transformer.py @@ -0,0 +1,463 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
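+# Usage sketch (shapes inferred from Transformer.forward below, example values only):
+#   trans = Transformer(d_model=256, n_head=8)
+#   hs, memory = trans(src=paddle.randn([2, 256, 24, 33]),    # backbone features
+#                      mask=paddle.zeros([2, 24, 33]),        # 0 marks valid pixels
+#                      query_embed=paddle.randn([100, 256]),
+#                      pos_embed=paddle.randn([2, 256, 24, 33]))
+#   # hs: [num_decoder_layers, 2, 100, 256], memory: [2, 256, 24, 33]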
+ +""" Transformer class and related methods for DETR """ + +import copy +import math +import paddle +import paddle.nn as nn + + +class Mlp(nn.Layer): + def __init__(self, d_model, dim_feedforward, dropout, act='relu'): + super(Mlp, self).__init__() + w_attr_1, b_attr_1 = self._init_weights() + self.linear1 = nn.Linear(d_model, + dim_feedforward, + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + self.dropout = nn.Dropout(dropout) + + w_attr_2, b_attr_2 = self._init_weights() + self.linear2 = nn.Linear(dim_feedforward, + d_model, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + if act == 'relu': + self.act = nn.ReLU() + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + return weight_attr, bias_attr + + def forward(self, x): + x = self.linear1(x) + x = self.act(x) + x = self.dropout(x) + x = self.linear2(x) + return x + + +class Attention(nn.Layer): + def __init__(self, d_model, n_head, dropout=0.): + super(Attention, self).__init__() + self.n_head = n_head + self.head_dim = int(d_model / n_head) + self.all_head_dim = self.head_dim * self.n_head + self.scales = self.head_dim ** -0.5 + + w_attr_1, b_attr_1 = self._init_weights() + self.q = nn.Linear(d_model, + self.all_head_dim, + weight_attr=w_attr_1, + bias_attr=b_attr_1) + w_attr_2, b_attr_2 = self._init_weights() + self.k = nn.Linear(d_model, + self.all_head_dim, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + w_attr_3, b_attr_3 = self._init_weights() + self.v = nn.Linear(d_model, + self.all_head_dim, + weight_attr=w_attr_3, + bias_attr=b_attr_3) + + #w_attr, b_attr = self._init_weights() + #self.qkv = nn.Linear(d_model, + # self.all_head_dim * 3, + # weight_attr=w_attr, + # bias_attr=b_attr) + + + w_attr_4, b_attr_4 = self._init_weights() + self.fc = nn.Linear(self.all_head_dim, + d_model, + weight_attr=w_attr_4, + bias_attr=b_attr_4) + self.attn_dropout = nn.Dropout(dropout) + self.dropout = nn.Dropout(dropout) + + self.softmax = nn.Softmax(axis=-1) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + return weight_attr, bias_attr + + def transpose_for_scores(self, x): + # [seq_l, batch, all_head_dim] -> [seq_l, batch, n_head, head_dim] + new_x_shape = x.shape[:-1] + [self.n_head, self.head_dim] + x = x.reshape(new_x_shape) + # [seq_l, batch, n_head, head_dim] -> [seq_l, batch*n_head, head_dim] + x = x.flatten(start_axis=1, stop_axis=2) + # [seq_l, batch*n_head, head_dim] -> [batch*n_head, seq_l, head_dim] + x = x.transpose([1, 0, 2]) + return x + + def forward(self, query, key, value, key_pad_mask=None): + SRC_L = key.shape[0] # key: [seq_l, batch_size, hidden_dim] + B = key.shape[1] + TGT_L = query.shape[0] + EMBED_DIM = query.shape[2] + + attn_mask = None + if key_pad_mask is not None: + assert key_pad_mask.shape == [B, SRC_L], f'expecting key_pad_mask shape of {[B, L]}, but got {key_pad_mask.shape}' + key_pad_mask = key_pad_mask.reshape([B, 1, 1, SRC_L]) + key_pad_mask = key_pad_mask.expand([B, self.n_head, 1, SRC_L]) + key_pad_mask = key_pad_mask.reshape([B*self.n_head, 1, SRC_L]) + + attn_mask = paddle.zeros_like(key_pad_mask) + inf_tensor = paddle.ones_like(key_pad_mask) * float('-inf') + attn_mask = paddle.where(key_pad_mask > 0.5, inf_tensor, attn_mask) # TODO: check True/False + + #print('query shape:', query.shape) + #x = paddle.concat([query, key, value], axis=-1) + 
#print('X shape=', x.shape) + #qkv = self.qkv(x).chunk(3, axis=-1) + #q, k, v = map(self.transpose_for_scores, qkv) + q = self.transpose_for_scores(self.q(query)) + k = self.transpose_for_scores(self.k(key)) + v = self.transpose_for_scores(self.v(value)) + #print('q.w:', self.q.weight ) + #print('q.b:', self.q.bias ) + #print('k.w:', self.k.weight ) + #print('k.b:', self.k.bias ) + #print('v.w:', self.v.weight ) + #print('v.b:', self.v.bias ) + + #print('========= q before scaling ========') + #print(q) + q = q * self.scales + #print('========= q after scaling ========') + #print(q) + attn = paddle.matmul(q, k, transpose_y=True) + + #print('attn shape=', attn.shape) + #attn = attn * self.scales + #print('============ attn =============') + #print(attn) + # add mask (-inf) to filter out pad/attn positions + #print('attn_mask, ', attn_mask.shape) + #print('attn, ', attn.shape) + if attn_mask is not None: + attn += attn_mask + attn = self.softmax(attn) + attn = self.attn_dropout(attn) + + #print('======= attn ========') + #print(attn) + + z = paddle.matmul(attn, v) # [batch*n_head, seq_l, head_dim] + #print('======== z =========') + #print(z) + z = z.transpose([1, 0, 2]) #[seq_l, batch*n_head, head_dim] + z = z.reshape([TGT_L, B, EMBED_DIM]) + + z = self.fc(z) + #print('========== z fc =========') + #print(z) + z = self.dropout(z) + return z + + +class TransformerEncoderLayer(nn.Layer): + def __init__(self, + d_model, + n_head, + dim_feedforward=2048, + dropout=0.1, + activation='relu', + normalize_before=False): + super(TransformerEncoderLayer, self).__init__() + self.self_attn = Attention(d_model, n_head, dropout=dropout) + self.mlp = Mlp(d_model, dim_feedforward, dropout, activation) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + + self.normalize_before = normalize_before # TODO: add pre and post + + def forward_post(self, + src, + src_key_pad_mask=None, + pos=None): + # + positional embedding to q and k + #print('src shape=', src.shape) + #print('pos shape=', pos.shape) + #print('-------- src and pos') + #print('src = ', src) + #print('pos = ', pos) + #print('------------------------ encoderlayer ----------') + q = (src + pos) if pos is not None else src + k = (src + pos) if pos is not None else src + #print('----- q and k:') + #print(q) + #print(k) + #print(src_key_pad_mask) + # attention + src2 = self.self_attn(query=q, key=k, value=src, key_pad_mask=src_key_pad_mask) + + #print('----- src2:') + + # attention add & norm + src = src + src2 + #print('==== src before norm1') + #print(src) + src = self.norm1(src) + #print('==== src after norm1') + #print(src) + # FFN + src2 = self.mlp(src) + #print('===== src2 ') + #print(src2) + # FFN add & norm + src = src + src2 + src = self.norm2(src) + return src + + def forward(self, src, src_key_pad_mask=None, pos=None): + return self.forward_post(src, src_key_pad_mask, pos) + + +class TransformerEncoder(nn.Layer): + def __init__(self, layer, num_layers, norm): + super(TransformerEncoder, self).__init__() + self.layers = nn.LayerList([copy.deepcopy(layer) for i in range(num_layers)]) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, src_key_pad_mask=None, pos=None): + output = src + for idx, layer in enumerate(self.layers): + #print(f'---------- encoder {idx} ------------') + output = layer(output, src_key_pad_mask=src_key_pad_mask, pos=pos) + #print(output, output.shape) + if self.norm is not None: + output = self.norm(output) + #print(f'---------- last encoder after norm ------------') + 
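+        # output keeps the [H*W, batch, d_model] layout and is consumed by the decoder
+        # as `memory` (see Transformer.forward).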
#print(output, output.shape) + return output + + +class TransformerDecoderLayer(nn.Layer): + def __init__(self, + d_model, + n_head, + dim_feedforward=2048, + dropout=0.1, + activation='relu', + normalize_before=False): + super(TransformerDecoderLayer, self).__init__() + self.self_attn = Attention(d_model, n_head, dropout=dropout) + self.dec_enc_attn = Attention(d_model, n_head, dropout=dropout) + self.mlp = Mlp(d_model, dim_feedforward, dropout, activation) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + + self.normalize_before = normalize_before #TODO: add forward_pre and post + + def forward_post(self, + tgt, + memory, + memory_key_pad_mask=None, + pos=None, + query_pos=None): + # + positional embedding to q and k + q = (tgt + query_pos) if query_pos is not None else tgt + k = (tgt + query_pos) if query_pos is not None else tgt + # dec self attention + #print('----- decoder self_attn: ') + tgt2 = self.self_attn(query=q, + key=k, + value=tgt) + #print('================================') + #print('===========dec self_attn =================') + #print('================================') + #print(tgt2) + + # dec self attention add & norm + tgt = tgt + tgt2 + tgt = self.norm1(tgt) + # dec enc attention + tgt2 = self.dec_enc_attn(query=(tgt + query_pos) if query_pos is not None else tgt, + key=(memory + pos) if pos is not None else memory, + value=memory, + key_pad_mask=memory_key_pad_mask) + + #print('================================') + #print('===========dec dec_enc_attn==================') + #print('================================') + #print(tgt2) + + # dec enc attention add & norm + tgt = tgt + tgt2 + tgt = self.norm2(tgt) + # FFN + tgt2 = self.mlp(tgt) + # FFN add & norm + tgt = tgt + tgt2 + tgt = self.norm3(tgt) + return tgt + + def forward(self, tgt, memory, memory_key_pad_mask=None, pos=None, query_pos=None): + return self.forward_post(tgt, memory, memory_key_pad_mask, pos, query_pos) + + +class TransformerDecoder(nn.Layer): + def __init__(self, layer, num_layers, norm=None, return_intermediate=False): + super(TransformerDecoder, self).__init__() + self.layers = nn.LayerList([copy.deepcopy(layer) for i in range(num_layers)]) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + + def forward(self, + target, + memory, + memory_key_pad_mask=None, + pos=None, + query_pos=None): + output = target + intermediate = [] + + for idx, layer in enumerate(self.layers): + #print(f'---------- decoder {idx} ------------') + output = layer(output, + memory, + memory_key_pad_mask=memory_key_pad_mask, + pos=pos, + query_pos=query_pos) + #print(output, output.shape) + if self.return_intermediate: + #print(output, output.shape) + #print(self.norm.weight) + #print(self.norm.bias) + #print('-------------- before and after norm --------------') + #print(self.norm(output), self.norm(output).shape) + intermediate.append(self.norm(output)) + + if self.norm is not None: + output = self.norm(output) + if self.return_intermediate: + intermediate.pop() + intermediate.append(output) + + if self.return_intermediate: + #print('!!!!!!!!!!!!!!!!!!!') + #print(intermediate) + return paddle.stack(intermediate) + + #print('!!!!!!!!!!!!!!!!!!!') + #print(output, output.shape) + return output.unsqueeze(0) + + +class Transformer(nn.Layer): + def __init__(self, + d_model=256, + n_head=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.1, + activation='relu', + 
normalize_before=False, + return_intermediate_dec=True): + super(Transformer, self).__init__() + encoder_layer = TransformerEncoderLayer(d_model, + n_head, + dim_feedforward, + dropout, + activation, + normalize_before) + encoder_norm = nn.LayerNorm(d_model) if normalize_before else None + self.encoder = TransformerEncoder(encoder_layer, + num_encoder_layers, + encoder_norm) + + decoder_layer = TransformerDecoderLayer(d_model, + n_head, + dim_feedforward, + dropout, + activation, + normalize_before) + decoder_norm = nn.LayerNorm(d_model) + self.decoder = TransformerDecoder(decoder_layer, + num_decoder_layers, + decoder_norm, + return_intermediate_dec) + + self._reset_params() + + self.n_head = n_head + self.d_model = d_model + + def _reset_params(self): + pass + + def forward(self, src, mask, query_embed, pos_embed): + B, C, H, W = src.shape + src = src.flatten(2) # [B, C, H, W] -> [B, C, H*W] + src = src.transpose([2, 0, 1]) # [B, C, H*W] -> [H*W, B, C] + pos_embed = pos_embed.flatten(2) # [B, dim, H, W] -> [B, dim, H*W] + pos_embed = pos_embed.transpose([2, 0, 1]) # [B, dim, H*W] -> [H*W, B, dim] + query_embed = query_embed.unsqueeze(1) #[num_queries, 1, d_model] + query_embed = query_embed.expand((query_embed.shape[0], B, query_embed.shape[2])) + mask = mask.flatten(1) # this mask is batch mask for multiple image sizes + + target = paddle.zeros_like(query_embed) # decoder 1st input is set to all zeros + + + #print('----- inside transformer') + #print(src.shape) + #print(pos_embed.shape) + #print(query_embed.shape) + #print(mask.shape) + #print('-----') + + memory = self.encoder(src, src_key_pad_mask=mask, pos=pos_embed) + + #print('||||||||||||||| memory |||||||||||||') + #print(memory, memory.shape) + + hs = self.decoder(target, + memory, + memory_key_pad_mask=mask, + pos=pos_embed, + query_pos=query_embed) + + #print('hs shape:', hs.shape) + #print(hs) + hs = hs.transpose([0, 2, 1, 3]) # [1, batch, n_queries, embed_dim] + memory = memory.transpose([1, 2, 0]) + memory = memory.reshape([B, C, H, W]) + + return hs, memory + + +def build_transformer(config): + return Transformer(d_model=config.MODEL.TRANS.HIDDEN_SIZE, + n_head=config.MODEL.TRANS.NUM_HEADS, + num_encoder_layers=config.MODEL.TRANS.NUM_ENCODER_LAYERS, + num_decoder_layers=config.MODEL.TRANS.NUM_DECODER_LAYERS, + dim_feedforward=config.MODEL.TRANS.MLP_DIM, + dropout=config.MODEL.DROPOUT, + activation='relu', + normalize_before=False, + return_intermediate_dec=True) + diff --git a/object_detection/DETR/transforms.py b/object_detection/DETR/transforms.py new file mode 100644 index 00000000..9bf99194 --- /dev/null +++ b/object_detection/DETR/transforms.py @@ -0,0 +1,357 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
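+# Note: unlike paddle.vision.transforms, every transform below takes and returns an
+# (image, target) pair so boxes and masks stay consistent with the image, e.g.
+#   image, target = RandomHorizontalFlip(p=0.5)(image, target)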
+ +""" Transforms for image data and detection targets""" + +import random +import numpy as np +import PIL +import paddle +import paddle.vision.transforms as T +from paddle.vision.transforms import functional as F +from random_erasing import RandomErasing +from box_ops import box_xyxy_to_cxcywh + + +def crop(image, target, region): + cropped_image = T.crop(image, *region) + target = target.copy() + i, j, h, w = region + target['size'] = paddle.to_tensor([h, w]) + fields = ['labels', 'area', 'iscrowd'] + + if 'boxes' in target: + boxes = target['boxes'] + max_size = paddle.to_tensor([h, w], dtype='float32') + cropped_boxes = boxes - paddle.to_tensor([j, i, j, i], dtype='float32') # box are (x1, y1, x2, y2) + cropped_boxes = paddle.minimum(cropped_boxes.reshape([-1, 2, 2]), max_size) + cropped_boxes = cropped_boxes.clip(min=0) + area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(axis=1) + target['boxes'] = cropped_boxes.reshape([-1, 4]) + target['area'] = area + fields.append('boxes') + + if 'masks' in target: + target['masks'] = target['masks'][:, i:i + h, j:j + w] + fields.append('masks') + + + # remove the boxe or mask if the area is zero + if 'boxes' in target or 'masks' in target: + if 'boxes' in target: + cropped_boxes = target['boxes'].reshape((-1, 2, 2)) + # FIXME: select indices where x2 > x1 and y2 > y1 + # This paddle api will raise error in current env + #keep = paddle.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], axis=1) + # Instead we use numpy for temp fix + cropped_boxes = cropped_boxes.cpu().numpy() + keep = np.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], axis=1) + #keep = keep.cpu().numpy() + else: + keep = target['masks'].flatten(1).any(1) + keep = keep.cpu().numpy() + + keep_idx = np.where(keep)[0].astype('int32') + keep = paddle.to_tensor(keep_idx) + + for field in fields: + target[field] = target[field].index_select(keep, axis=0) + + return cropped_image, target + + +def hflip(image, target): + flipped_image = T.hflip(image) + + w, h = image.size + + target = target.copy() + if 'boxes' in target: + boxes = target['boxes'] # n x 4 + boxes = boxes.index_select(paddle.to_tensor([2, 1, 0, 3], dtype='int32'), axis=1) + boxes = boxes * paddle.to_tensor( + [-1, 1, -1, 1], dtype='float32') + paddle.to_tensor([w, 0, w, 0], dtype='float32') + target['boxes'] = boxes + + if 'masks' in target: + target['masks'] = (target['masks']).flip(axis=[-1]) + + return flipped_image, target + + +def resize(image, target, size, max_size=None): + def get_size_with_aspect_ratio(image_size, size, max_size=None): + """ get new image size for rescale, aspect ratio is kept, and longer side must < max_size + Args: + image_size: tuple/list of image width and height + size: length of shorter side of scaled image + max_size: max length of longer side of scaled image + Returns: + size: output image size in (h, w) order. 
+ """ + w, h = image_size + if max_size is not None: + min_original_size = float(min(w, h)) + max_original_size = float(max(w, h)) + # size is shorter side and keep the aspect ratio, if the longer side + # is larger than the max_size + if max_original_size / min_original_size * size > max_size: + # longer side is the max_size, shorter side size is: + size = int(round(max_size * min_original_size / max_original_size)) + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + """"get new image size to rescale + Args: + image_size: tuple, Pillow image size, (width, height) + size: int or list/tuple, if size is list or tuple, return + this size as the new image size to rescale, if size is a + single int, then compute the new image size by this size + (as shorter side) and max_size (as longer side), also keep + the same aspect_ratio as original image. + max_size: longest side max size of new image size + Return: + size: tuple, (width, height) + """ + if isinstance(size, (list, tuple)): + return size[::-1] + else: + return get_size_with_aspect_ratio(image_size, size, max_size) + + # STEP0: get new image size + size = get_size(image.size, size, max_size) + # STEP1: resize image with new size + rescaled_image = T.resize(image, size) # here size is (h, w) + # STEP2: resize targets + if target is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + + target = target.copy() + if 'boxes' in target: + boxes = target['boxes'] + if boxes.shape[0] == 0: # empty boxes + scaled_boxes = boxes + else: # this line works well in pytorch, but not in paddle + scaled_boxes = boxes * paddle.to_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) + target['boxes'] = scaled_boxes + + if 'area' in target: + area = target['area'] + scaled_area = area * (ratio_width * ratio_height) + target['area'] = scaled_area + + h, w = size + target['size'] = paddle.to_tensor([h, w]) + + if 'masks' in target: + masks = target['masks'] # [N, H, W] + masks = masks.unsqueeze(-1).astype('float32') #[N, H, W, 1] + masks = paddle.nn.functional.interpolate( + masks, size, data_format='NHWC') #[N, H', W', 1] + masks = masks[:, :, :, 0] > 0.5 + masks = masks.astype('int32') + target['masks'] = masks + + return rescaled_image, target + + +def pad(image, target, padding): + padded_image = T.pad(image, (0, 0, padding[0], padding[1])) + if target is None: + return padded_image, None + target = target.copy() + target['size'] = paddle.to_tensor(padded_image.size[::-1]) + if 'masks' in target: + target['masks'] = T.pad(target['masks'], (0, padding[0], 0, padding[1])) + return padded_image, target + + +class RandomCrop(): + def __init__(self, size): + self.size = size + + @staticmethod + def get_param(image, output_size): + def _get_image_size(img): + if F._is_pil_image(img): + return img.size + elif F._is_numpy_image(img): + return img.shape[:2][::-1] + elif F._is_tensor_image(img): + return img.shape[1:][::-1] # chw + else: + raise TypeError("Unexpected type {}".format(type(img))) + + w, h = _get_image_size(image) + th, tw = output_size + if w == tw and h == th: + return 0, 0, h, w + + i = random.randint(0, h - th) + j = random.randint(0, w - tw) + return i, j, th, tw + + def __call__(self, image, target): + region = 
RandomCrop.get_param(image, self.size)
+        return crop(image, target, region)
+
+
+class RandomSizeCrop():
+    def __init__(self, min_size, max_size):
+        self.min_size = min_size
+        self.max_size = max_size
+
+    def __call__(self, image, target):
+        w = random.randint(self.min_size, min(image.width, self.max_size))
+        h = random.randint(self.min_size, min(image.height, self.max_size))
+        region = RandomCrop.get_param(image, (h, w))
+        return crop(image, target, region)
+
+
+class CenterCrop():
+    def __init__(self, size):
+        self.size = size
+
+    def __call__(self, image, target):
+        image_width, image_height = image.size
+        crop_height, crop_width = self.size
+        crop_top = int(round((image_height - crop_height) / 2.))
+        crop_left = int(round((image_width - crop_width) / 2.))
+        return crop(image, target, (crop_top, crop_left, crop_height, crop_width))
+
+
+class RandomHorizontalFlip():
+    def __init__(self, p=0.5):
+        self.p = p
+
+    def __call__(self, image, target):
+        if random.random() < self.p:
+            return hflip(image, target)
+        return image, target
+
+
+class RandomResize():
+    def __init__(self, sizes, max_size=None):
+        assert isinstance(sizes, (list, tuple))
+        self.sizes = sizes
+        self.max_size = max_size
+
+    def __call__(self, image, target=None):
+        size = random.choice(self.sizes)
+        return resize(image, target, size, self.max_size)
+
+
+class RandomPad():
+    def __init__(self, max_pad):
+        self.max_pad = max_pad
+
+    def __call__(self, image, target):
+        pad_x = random.randint(0, self.max_pad)
+        pad_y = random.randint(0, self.max_pad)
+        return pad(image, target, (pad_x, pad_y))
+
+
+class RandomSelect():
+    """ Randomly apply one of the two transforms: transforms2 with probability p, otherwise transforms1"""
+    def __init__(self, transforms1, transforms2, p=0.5):
+        self.transforms1 = transforms1
+        self.transforms2 = transforms2
+        self.p = p
+
+    def __call__(self, image, target):
+        if random.random() > self.p:
+            return self.transforms1(image, target)
+        return self.transforms2(image, target)
+
+
+class ToTensor():
+    def __call__(self, image, target):
+        return T.to_tensor(image), target
+
+
+# keep a handle to the imported RandomErasing before the wrapper below shadows the name,
+# otherwise the wrapper would recursively call itself inside __init__
+_RandomErasing = RandomErasing
+
+
+class RandomErasing():
+    """Apply random erasing to the image, pass the detection target through unchanged"""
+    def __init__(self, *args, **kwargs):
+        self.eraser = _RandomErasing(*args, **kwargs)
+
+    def __call__(self, image, target):
+        return self.eraser(image), target
+
+
+class Normalize():
+    """Normalization for image and labels.
+ + Specifically, image is normalized with -mean and /std, + boxes are converted to [cx, cy, w, h] format and scaled to + [0, 1] according to image size + """ + + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, image, target=None): + image = T.functional.normalize(image, mean=self.mean, std=self.std) + if target is None: + return image, None + target = target.copy() + h, w = image.shape[-2:] + if 'boxes' in target and target['boxes'].shape[0] != 0: + boxes = target['boxes'] + boxes = box_xyxy_to_cxcywh(boxes) + boxes = boxes / paddle.to_tensor([w, h, w, h], dtype='float32') + target['boxes'] = boxes + return image, target + + +class Compose(): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target): + for t in self.transforms: + image, target = t(image, target) + return image, target + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + format_string += '\n' + format_string += ' {0}'.format(t) + format_string += '\n)' + return format_string + + + + + + + + + + + + diff --git a/object_detection/DETR/utils.py b/object_detection/DETR/utils.py new file mode 100644 index 00000000..9304c319 --- /dev/null +++ b/object_detection/DETR/utils.py @@ -0,0 +1,243 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities""" + +import copy +import pickle +import numpy as np +import paddle +import paddle.distributed as dist +from paddle.optimizer.lr import LRScheduler + + +class AverageMeter(): + """ Meter for monitoring losses""" + def __init__(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.reset() + + def reset(self): + """reset all values to zeros""" + self.avg = 0 + self.sum = 0 + self.cnt = 0 + + def update(self, val, n=1): + """update avg by val and n, where val is the avg of n values""" + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + +def collate_fn(batch): + """Collate function for batching samples + + Samples varies in sizes, here convert samples to NestedTensor which pads the tensor, + and generate the corresponding mask, so that the whole batch is of the same size. 
+ + """ + # eliminate invalid data (where boxes is [] tensor) + old_batch_len = len(batch) + batch = [x for x in batch if x[1]['boxes'].shape[0] != 0] + # try refill empty sample by other sample in current batch + #print('batch len = ', old_batch_len) + #print('new batch len = ', len(batch)) + new_batch_len = len(batch) + for i in range(new_batch_len, old_batch_len): + batch.append(copy.deepcopy(batch[i%new_batch_len])) + #print('batch = ', batch) + #print('filled batch len = ', len(batch)) + batch = list(zip(*batch)) # batch[0]: data tensor, batch[1]: targets dict + + batch[0] = nested_tensor_from_tensor_list(batch[0]) + return tuple(batch) + + +def _max_by_axis(the_list): + maxes = the_list[0] + for sublist in the_list[1:]: + for idx, item in enumerate(sublist): + maxes[idx] = max(maxes[idx], item) + return maxes + + +class NestedTensor(): + """Each NestedTensor has .tensor and .mask attributes, which are paddle.Tensors""" + def __init__(self, tensors, mask): + self.tensors = tensors + self.mask = mask + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +def nested_tensor_from_tensor_list(tensor_list): + """make the batch handle different image sizes + + This method take a list of tensors with different sizes, + then max size is selected as the final batch size, + smaller samples are padded with zeros(bottom-right), + and corresponding masks are generated. + + """ + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size # len is the num of images in this batch + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + data_tensor = paddle.zeros(batch_shape, dtype=dtype) + mask = paddle.ones((b, h, w), dtype='int32') + # zip has broadcast for tensor and mask + #print('===== inside nested_tensor_from_tensor_list') + # zip cannot used in paddle, which will create a new tensor. 
in pytorch it works well + #for img, pad_img, m in zip(tensor_list, tensor, mask): + # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]] = img + # m[: img.shape[0], :img.shape[1]] = 0 + for idx in range(b): + s0 = tensor_list[idx].shape[0] + s1 = tensor_list[idx].shape[1] + s2 = tensor_list[idx].shape[2] + # direct set value raise error in current env, we use numpy to bypass + data_tensor[idx, : s0, : s1, : s2] = tensor_list[idx].cpu().numpy() + #data_tensor[idx, : s0, : s1, : s2] = tensor_list[idx] + mask[idx, : s1, : s2] = 0 + return NestedTensor(data_tensor, mask) + + +def reduce_dict(input_dict, average=True): + """Impl all_reduce for dict of tensors in DDP""" + world_size = dist.get_world_size() + if world_size < 2: + return input_dict + with paddle.no_grad(): + names = [] + values = [] + for k in sorted(input_dict.keys()): + names.append(k) + values.append(input_dict[k]) + values = paddle.stack(values, axis=0) + dist.all_reduce(values) + if average: + values /= world_size + reduced_dict = {k: v for k, v in zip(names, values)} + return reduced_dict + + +@paddle.no_grad() +def accuracy(output, target, topk=(1,)): + if target.numel() == 0: + return [paddle.zeros([])] + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.reshape(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).astype('float32').sum(0) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +class WarmupCosineScheduler(LRScheduler): + """Warmup Cosine Scheduler + + First apply linear warmup, then apply cosine decay schedule. + Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" + Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining + "total_epochs - warmup_epochs" + + Attributes: + learning_rate: the starting learning rate (without warmup), not used here! + warmup_start_lr: warmup starting learning rate + start_lr: the starting learning rate (without warmup) + end_lr: the ending learning rate after whole loop + warmup_epochs: # of epochs for warmup + total_epochs: # of total epochs (include warmup) + """ + def __init__(self, + learning_rate, + warmup_start_lr, + start_lr, + end_lr, + warmup_epochs, + total_epochs, + cycles=0.5, + last_epoch=-1, + verbose=False): + """init WarmupCosineScheduler """ + self.warmup_epochs = warmup_epochs + self.total_epochs = total_epochs + self.warmup_start_lr = warmup_start_lr + self.start_lr = start_lr + self.end_lr = end_lr + self.cycles = cycles + super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + """ return lr value """ + if self.last_epoch < self.warmup_epochs: + val = (self.start_lr - self.warmup_start_lr) * float( + self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr + return val + + progress = float(self.last_epoch - self.warmup_epochs) / float( + max(1, self.total_epochs - self.warmup_epochs)) + val = max(0.0, 0.5 * (1. 
+ math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) + return val + + +def all_gather(data): + """ run all_gather on any picklable data (do not requires tensors) + Args: + data: picklable object + Returns: + data_list: list of data gathered from each rank + """ + world_size = dist.get_world_size() + if world_size == 1: + return [data] + + buffer = pickle.dumps(data) #write data into Bytes and stores in buffer + np_buffer = np.frombuffer(buffer, dtype=np.int8) + tensor = paddle.to_tensor(np_buffer, dtype='int32') # uint8 doese not have many ops in paddle + + # obtain Tensor size of each rank + local_size = paddle.to_tensor([tensor.shape[0]]) + size_list = [] + dist.all_gather(size_list, local_size) + max_size = max(size_list) + + # receiving tensors from all ranks, + # all_gather does not support different shape, so we use padding + tensor_list = [] + if local_size != max_size: + padding = paddle.empty(shape=(max_size - local_size, ), dtype='int32') + tensor = paddle.concat((tensor, padding), axis=0) + dist.all_gather(tensor_list, tensor) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.astype('uint8').cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list diff --git a/semantic_segmentation/README.md b/semantic_segmentation/README.md new file mode 100644 index 00000000..a54bd5d2 --- /dev/null +++ b/semantic_segmentation/README.md @@ -0,0 +1,177 @@ + +# Semantic segmentation toolkit based on Visual Transformers + +Semantic segmentation aims at classifying each pixel in an image to a specified semantic category, including objects (e.g., bicycle, car, people) and stuff (e.g., road, bench, sky). +
+ +
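+To make the task statement concrete, the toy sketch below (a stand-in 1x1 convolution, not one of the models in this repository) shows the tensor shapes a segmentation network consumes and produces: an image batch of shape `[N, 3, H, W]` goes in, per-pixel class scores of shape `[N, num_classes, H, W]` come out, and an argmax over the class axis yields the predicted label map.
+
+```python
+import paddle
+
+num_classes = 60                                 # e.g. Pascal-Context
+image = paddle.rand([1, 3, 480, 480])            # dummy input batch [N, 3, H, W]
+toy_head = paddle.nn.Conv2D(3, num_classes, 1)   # stand-in for a real encoder/decoder
+logits = toy_head(image)                         # [1, 60, 480, 480] per-pixel class scores
+pred = paddle.argmax(logits, axis=1)             # [1, 480, 480] integer label map
+```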
+
+## Environment
+This code is developed under the following configurations:
+
+- Hardware: 1/2/4/8 GPU(s) for training and testing
+- Software: CentOS 6.10, CUDA=10.2, Python=3.8, Paddle=2.1.0
+
+## Installation
+1. Create a conda virtual environment and activate it.
+
+```shell
+conda create -n paddlevit python=3.8
+conda activate paddlevit
+```
+
+2. Install PaddlePaddle following the official instructions, e.g.,
+```shell
+conda install paddlepaddle-gpu==2.1.0 cudatoolkit=10.2 --channel https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/Paddle/
+```
+
+3. Install PaddleViT
+```shell
+git clone https://github.com/BR-IDL/PaddleViT.git
+cd PaddleViT/semantic_segmentation
+pip3 install -r requirements.txt
+```
+
+## Demo
+We provide a demo script [demo.py](./demo/demo.py). This script performs inference on single images. You can put the input images in `./demo/img`.
+```shell
+cd demo
+CUDA_VISIBLE_DEVICES=0 python3 demo.py \
+    --config ${CONFIG_FILE} \
+    --model_path ${MODEL_PATH} \
+    --pretrained_backbone ${PRETRAINED_BACKBONE} \
+    --img_dir ${IMAGE_DIRECTORY} \
+    --results_dir ${RESULT_DIRECTORY}
+```
+Examples:
+```shell
+cd demo
+CUDA_VISIBLE_DEVICES=0 python3 demo.py \
+    --config ../configs/setr/SETR_PUP_Large_768x768_80k_cityscapes_bs_8.yaml \
+    --model_path ../pretrain_models/setr/SETR_PUP_cityscapes_b8_80k.pdparams \
+    --pretrained_backbone ../pretrain_models/backbones/vit_large_patch16_224.pdparams \
+    --img_dir ./img/ \
+    --results_dir ./results/
+```
+
+
+## Quick start: training and testing models
+
+### 1. Preparing data
+#### Pascal-Context dataset
+Download the Pascal-Context dataset. "pascal_context/SegmentationClassContext" is generated by running the script [voc2010_to_pascalcontext.py](tools/voc2010_to_pascalcontext.py).
+Specifically, download PASCAL VOC2010 from http://host.robots.ox.ac.uk/pascal/VOC/voc2010/VOCtrainval_03-May-2010.tar and the annotation file from https://codalabuser.blob.core.windows.net/public/trainval_merged.json. It should have this basic structure:
+```
+pascal_context
+|-- Annotations
+|-- ImageSets
+|-- JPEGImages
+|-- SegmentationClass
+|-- SegmentationClassContext
+|-- SegmentationObject
+|-- trainval_merged.json
+|-- voc2010_to_pascalcontext.py
+```
+#### ADE20K dataset
+Download the ADE20K dataset from http://sceneparsing.csail.mit.edu/. It should have this basic structure:
+```
+ADEChallengeData2016
+|-- annotations
+|   |-- training
+|   `-- validation
+|-- images
+|   |-- training
+|   `-- validation
+|-- objectInfo150.txt
+`-- sceneCategories.txt
+```
+#### Cityscapes dataset
+Download the Cityscapes dataset from https://www.cityscapes-dataset.com/. **labelTrainIds.png** files are used for Cityscapes training; they are generated by the script [convert_cityscapes.py](tools/convert_cityscapes.py). It should have this basic structure:
+```
+cityscapes
+|-- gtFine
+|   |-- test
+|   |-- train
+|   `-- val
+|-- leftImg8bit
+|   |-- test
+|   |-- train
+|   `-- val
+```
+#### Trans10kV2 dataset
+Download the Trans10kV2 dataset from [Google Drive](https://drive.google.com/file/d/1YzAAMY8xfL9BMTIDU-nFC3dcGbSIBPu5/view?usp=sharing) or [Baidu Drive](https://pan.baidu.com/s/1P-2l-Q2brbnwRd2kXi--Dg) (code: oqms). It should have this basic structure:
+```
+Trans10K_cls12
+|-- test
+|   |-- images
+|   `-- masks_12
+|-- train
+|   |-- images
+|   `-- masks_12
+|-- validation
+|   |-- images
+|   `-- masks_12
+```
+
+### 2. Testing
+#### Single-scale testing on single GPU
+```shell
+CUDA_VISIBLE_DEVICES=0 python3 val.py \
+    --config ./configs/setr/SETR_MLA_Large_480x480_80k_pascal_context_bs_8.yaml \
+    --model_path ./pretrain_models/setr/SETR_MLA_pascal_context_b8_80k.pdparams
+```
+
+#### Multi-scale testing on single GPU
+```shell
+CUDA_VISIBLE_DEVICES=0 python3 val.py \
+    --config ./configs/setr/SETR_MLA_Large_480x480_80k_pascal_context_bs_8.yaml \
+    --model_path ./pretrain_models/setr/SETR_MLA_pascal_context_b8_80k.pdparams \
+    --multi_scales True
+```
+
+#### Single-scale testing on multi GPU
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -u -m paddle.distributed.launch val.py \
+    --config ./configs/setr/SETR_MLA_Large_480x480_80k_pascal_context_bs_8.yaml \
+    --model_path ./pretrain_models/setr/SETR_MLA_pascal_context_b8_80k.pdparams
+```
+
+#### Multi-scale testing on multi GPU
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -u -m paddle.distributed.launch val.py \
+    --config ./configs/setr/SETR_MLA_Large_480x480_80k_pascal_context_bs_8.yaml \
+    --model_path ./pretrain_models/setr/SETR_MLA_pascal_context_b8_80k.pdparams \
+    --multi_scales True
+```
+
+> Note:
+>
+> - The `--model_path` option takes the path of the pretrained segmentation model weights (e.g., SETR).
+
+
+### 3. Training
+#### Training on single GPU
+
+```shell
+CUDA_VISIBLE_DEVICES=0 python3 train.py \
+    --config ./configs/setr/SETR_MLA_Large_480x480_80k_pascal_context_bs_8.yaml
+```
+> Note:
+> - The training options such as lr, image size, model layers, etc., can be changed in the `.yaml` file passed via `--config`. All the available settings can be found in `./config.py` (see the appendix at the end of this README for a short sketch of how the YAML overrides the defaults).
+
+#### Training on multi GPU
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -u -m paddle.distributed.launch train.py \
+    --config ./configs/setr/SETR_MLA_Large_480x480_80k_pascal_context_bs_8.yaml
+```
+> Note:
+>
+> - The training options such as lr, image size, model layers, etc., can be changed in the `.yaml` file passed via `--config`. All the available settings can be found in `./config.py`.
+
+
+## Contact
+If you have any questions regarding this repo, please create an issue.
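+
+## Appendix: how the YAML configs are applied
+All default options live in `./config.py` and are overridden by the YAML file passed on the command line. The minimal sketch below (run from `semantic_segmentation/`; the argument names here are chosen for illustration, the actual CLI is defined in `train.py`/`val.py`) shows how `get_config()` and `update_config()` from `config.py` merge a YAML file on top of the defaults:
+
+```python
+import argparse
+from config import get_config, update_config
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--cfg', default='./configs/setr/SETR_MLA_Large_480x480_80k_pascal_context_bs_8.yaml')
+parser.add_argument('--pretrained_backbone', default='./pretrain_models/backbones/vit_large_patch16_224.pdparams')
+args = parser.parse_args()
+
+config = get_config()                  # clone of the _C defaults defined in config.py
+config = update_config(config, args)   # merge the YAML on top and set MODEL.PRETRAINED
+print(config.MODEL.NAME)               # e.g. 'SETR_MLA' with the YAML above
+print(config.TRAIN.ITERS)              # e.g. 80000
+```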
diff --git a/semantic_segmentation/config.py b/semantic_segmentation/config.py new file mode 100644 index 00000000..753dcb38 --- /dev/null +++ b/semantic_segmentation/config.py @@ -0,0 +1,220 @@ +import os +from yacs.config import CfgNode as CN +import yaml + +_C = CN() +_C.BASE = [''] + +# data settings +_C.DATA = CN() +_C.DATA.BATCH_SIZE = 4 #train batch_size for single GPU +_C.DATA.BATCH_SIZE_VAL = 1 # val batch_size for single GPU +_C.DATA.DATASET = 'PascalContext' # dataset name +_C.DATA.DATA_PATH = '/home/ssd3/wutianyi/datasets/pascal_context' +_C.DATA.CROP_SIZE = (480,480) # input_size (training) +_C.DATA.NUM_CLASSES = 60 # 19 for cityscapes, 60 for Pascal-Context +_C.DATA.NUM_WORKERS = 0 # number of data loading threads (curren paddle must set to 0) + +# model settings +_C.MODEL = CN() +_C.MODEL.NAME = 'SETR_MLA' +_C.MODEL.ENCODER = CN() +_C.MODEL.ENCODER.TYPE = 'ViT_MLA' +_C.MODEL.ENCODER.OUT_INDICES = [5,11,17,23] +_C.MODEL.ENCODER.MULTI_GRID = False # Trans2seg cnn encoder setting +_C.MODEL.ENCODER.MULTI_DILATION = None # Trans2seg cnn encoder setting + +_C.MODEL.DECODER_TYPE = 'ViT_MLAHead' +_C.MODEL.RESUME = None +_C.MODEL.PRETRAINED = None +_C.MODEL.NUM_CLASSES = 1000 +_C.MODEL.DROPOUT = 0.0 # 0.0 +_C.MODEL.ATTENTION_DROPOUT = 0.0 +_C.MODEL.DROP_PATH = 0.1 # for SwinTransformer +_C.MODEL.OUTPUT_STRIDE = 16 +_C.MODEL.BACKBONE_SCALE = 1.0 + +# Transformer backbone settings +_C.MODEL.TRANS = CN() +_C.MODEL.TRANS.HYBRID = False #TODO: implement +_C.MODEL.TRANS.PATCH_GRID = None #TODO: implement +_C.MODEL.TRANS.PATCH_SIZE = None # 16 +_C.MODEL.TRANS.HIDDEN_SIZE = 768 # 768(Base), 1024(Large), 1280(Huge) +_C.MODEL.TRANS.MLP_RATIO = 4 +_C.MODEL.TRANS.NUM_HEADS = None # 12(Base), 16(Large), 16(Huge) +_C.MODEL.TRANS.NUM_LAYERS = None # 12(Base), 24(Large), 32(Huge) +_C.MODEL.TRANS.QKV_BIAS = True + +## special settings for SwinTransformer +_C.MODEL.TRANS.WINDOW_SIZE = 7 +_C.MODEL.TRANS.IN_CHANNELS = 3 +_C.MODEL.TRANS.EMBED_DIM = 96 # same as HIDDEN_SIZE +_C.MODEL.TRANS.STAGE_DEPTHS = [2, 2, 6, 2] +_C.MODEL.TRANS.NUM_HEADS = None # [3, 6, 12, 24] +_C.MODEL.TRANS.QK_SCALE = None +_C.MODEL.TRANS.APE = False # absolute postional embedding +_C.MODEL.TRANS.PATCH_NORM = True +#_C.MODEL.TRANS.DROP_PATH_RATE = None +_C.MODEL.TRANS.KEEP_CLS_TOKEN = False + +## special settings for Segformer +_C.MODEL.TRANS.NUM_STAGES = 4 +_C.MODEL.TRANS.STRIDES = [4, 2, 2, 2] +_C.MODEL.TRANS.SR_RATIOS = [8, 4, 2, 1] + +# MLA Decoder setting +_C.MODEL.MLA = CN() +#_C.MODEL.MLA.MLA_INDEX = [2, 5, 8, 11] # Base: [2, 5, 8, 11]; Large: [5, 11, 17, 23] +_C.MODEL.MLA.MLA_CHANNELS = 256 +_C.MODEL.MLA.MLAHEAD_CHANNELS=128 +_C.MODEL.MLA.AUXIHEAD = False +_C.MODEL.MLA.MLAHEAD_ALIGN_CORNERS = False + + +# PUP and Naive Decoder setting +_C.MODEL.PUP = CN() +_C.MODEL.PUP.INPUT_CHANNEL = 1024 +_C.MODEL.PUP.NUM_CONV = 4 +_C.MODEL.PUP.NUM_UPSAMPLE_LAYER = 4 +_C.MODEL.PUP.CONV3x3_CONV1x1 = True +_C.MODEL.PUP.ALIGN_CORNERS = False + +# Auxi PUP and Naive Decoder setting +_C.MODEL.AUXPUP = CN() +_C.MODEL.AUXPUP.INPUT_CHANNEL = 1024 +_C.MODEL.AUXPUP.NUM_CONV = 2 +_C.MODEL.AUXPUP.NUM_UPSAMPLE_LAYER = 2 +_C.MODEL.AUXPUP.CONV3x3_CONV1x1 = True +_C.MODEL.AUXPUP.ALIGN_CORNERS = False + +# UperHead Decoder setting +_C.MODEL.UPERHEAD = CN() +_C.MODEL.UPERHEAD.IN_CHANNELS = [96, 192, 384, 768] +_C.MODEL.UPERHEAD.CHANNELS = 512 +_C.MODEL.UPERHEAD.IN_INDEX = [0, 1, 2, 3] +_C.MODEL.UPERHEAD.POOL_SCALES = [1, 2, 3, 6] +_C.MODEL.UPERHEAD.DROP_RATIO = 0.1 +_C.MODEL.UPERHEAD.ALIGN_CORNERS = False + +# Auxilary Segmentation Head setting 
+_C.MODEL.AUX = CN() +_C.MODEL.AUX.AUXIHEAD = True +_C.MODEL.AUX.AUXHEAD_ALIGN_CORNERS = False + +# Auxilary FCN Head +_C.MODEL.AUXFCN = CN() +_C.MODEL.AUXFCN.IN_CHANNELS = 384 +_C.MODEL.AUXFCN.UP_RATIO = 16 + +#DPT Head settings +_C.MODEL.DPT = CN() +_C.MODEL.DPT.HIDDEN_FEATURES = [256, 512, 1024, 1024] +_C.MODEL.DPT.FEATURES = 256 +_C.MODEL.DPT.READOUT_PROCESS = "project" + +#Segmenter Head Settings +_C.MODEL.SEGMENTER = CN() +_C.MODEL.SEGMENTER.NUM_LAYERS = 2 + +#Segformer Head Settings +_C.MODEL.SEGFORMER = CN() +_C.MODEL.SEGFORMER.IN_CHANNELS = [32, 64, 160, 256] +_C.MODEL.SEGFORMER.CHANNELS = 256 +_C.MODEL.SEGFORMER.ALIGN_CORNERS = False + +# training settings +_C.TRAIN = CN() +_C.TRAIN.USE_GPU = True +_C.TRAIN.LAST_EPOCH = 0 +_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune +_C.TRAIN.END_LR = 1e-4 +_C.TRAIN.DECODER_LR_COEF = 1.0 +_C.TRAIN.GRAD_CLIP = 1.0 +_C.TRAIN.ITERS = 80000 +_C.TRAIN.WEIGHT_DECAY = 0.0 # 0.0 for finetune +_C.TRAIN.POWER=0.9 +_C.TRAIN.DECAY_STEPS= 80000 +_C.TRAIN.APEX = False + +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'PolynomialDecay' +_C.TRAIN.LR_SCHEDULER.MILESTONES = "30, 60, 90" # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.POWER = 0.9 # only used in PolynomialDecay +_C.TRAIN.LR_SCHEDULER.GAMMA = 0.1 +_C.TRAIN.LR_SCHEDULER.OHEM = False # whether to use ohem +_C.TRAIN.LR_SCHEDULER.AUX = False # whether to use aux loss +_C.TRAIN.LR_SCHEDULER.AUX_WEIGHT = 0.4 # aux loss weight +_C.TRAIN.LR_SCHEDULER.LOSS_NAME = '' # loss name +_C.TRAIN.LR_SCHEDULER.DECODER_LR_FACTOR = 10.0 # decoder lr x10 + +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'SGD' +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for adamW +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# Trans2Seg settings +_C.MODEL.TRANS2SEG = CN() +_C.MODEL.TRANS2SEG.EMBED_DIM = 256 +_C.MODEL.TRANS2SEG.DEPTH = 4 +_C.MODEL.TRANS2SEG.NUM_HEADS = 8 +_C.MODEL.TRANS2SEG.MLP_RATIO = 3. 
+_C.MODEL.TRANS2SEG.HID_DIM = 64 + +# val settings +_C.VAL = CN() +_C.VAL.USE_GPU = True +_C.VAL.MULTI_SCALES_VAL = False +_C.VAL.SCALE_RATIOS= [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +_C.VAL.IMAGE_BASE_SIZE = None # 520 for pascal context +_C.VAL.KEEP_ORI_SIZE = False +_C.VAL.RESCALE_FROM_ORI = False +_C.VAL.CROP_SIZE = [480,480] +_C.VAL.STRIDE_SIZE = [320,320] +_C.VAL.MEAN = [123.675, 116.28, 103.53] +_C.VAL.STD = [58.395, 57.12, 57.375] + +# misc +_C.SAVE_DIR = "./output" +_C.KEEP_CHECKPOINT_MAX = 3 +_C.TAG = "default" +_C.SAVE_FREQ_CHECKPOINT = 1000 # freq to save chpt +_C.LOGGING_INFO_FREQ = 50 # freq to logging info +_C.VALIDATE_FREQ = 2000 # freq to do validation +_C.SEED = 0 +_C.EVAL = False # run evaluation only +_C.LOCAL_RANK = 0 + +def _update_config_from_file(config, cfg_file): + config.defrost() + with open(cfg_file, 'r') as infile: + yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader) + for cfg in yaml_cfg.setdefault('BASE', ['']): + if cfg: + _update_config_from_file( + config, os.path.join(os.path.dirname(cfg_file), cfg) + ) + print('merging config from {}'.format(cfg_file)) + config.merge_from_file(cfg_file) + config.freeze() + +def update_config(config, args): + """Update config by ArgumentParser + Args: + args: ArgumentParser contains options + Return: + config: updated config + """ + if args.cfg: + _update_config_from_file(config, args.cfg) + config.defrost() + if "pretrained_backbone" in args: + config.MODEL.PRETRAINED = args.pretrained_backbone + #config.freeze() + return config + +def get_config(): + config = _C.clone() + return config diff --git a/semantic_segmentation/configs/dpt/DPT_Large_480x480_160k_ade20k_bs_16.yaml b/semantic_segmentation/configs/dpt/DPT_Large_480x480_160k_ade20k_bs_16.yaml new file mode 100644 index 00000000..a5c4a3fe --- /dev/null +++ b/semantic_segmentation/configs/dpt/DPT_Large_480x480_160k_ade20k_bs_16.yaml @@ -0,0 +1,53 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/ADEChallengeData2016' + CROP_SIZE: (480,480) # input_size (training) + NUM_CLASSES: 150 +MODEL: + NAME: 'DPT' + ENCODER: + TYPE: 'ViT' + OUT_INDICES: [5, 11, 17, 23] + PRETRAINED: None + DECODER_TYPE: 'DPTHead' + DPT: + HIDDEN_FEATURES: [256, 512, 1024, 1024] + FEATURES: 256 + READOUT_PROCESS: 'project' + NUM_CLASSES: 150 + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 1024 # 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 16 # 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 24 # 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + KEEP_CLS_TOKEN: True +TRAIN: + BASE_LR: 0.001 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 160000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 160000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + SCALE_RATIOS: [0.5, 0.75, 1.0] + IMAGE_BASE_SIZE: 520 + CROP_SIZE: [480,480] + STRIDE_SIZE: [320,320] + MEAN: [127.5, 127.5, 127.5] + STD: [127.5, 127.5, 127.5] +SAVE_DIR: "./output/DPT_Large_480x480_160k_ade20k_bs_16" + + + diff --git a/semantic_segmentation/configs/dpt/README.md b/semantic_segmentation/configs/dpt/README.md new file mode 100644 index 00000000..849b96eb --- /dev/null +++ b/semantic_segmentation/configs/dpt/README.md @@ -0,0 +1,21 @@ +# Vision Transformers for Dense Prediction, [arxiv](https://arxiv.org/pdf/2103.13413.pdf) + +The official pytorch implementation is 
[here](https://github.com/isl-org/DPT). +## Framework +drawing + +## Model Zoo ## +### ADE20K ### +|Model | Backbone | Batch_size | Iteration | mIoU (ss) | mIoU (ms+flip) | Backbone_checkpoint | Model_checkpoint | ConfigFile | +|-----------|-----------|------------|-----------|-----------|----------------|-----------------------------------------------|-----------------------------------------------------------------------|------------| +|DPT | ViT_Large | 16 | 160k | 47.21 | - | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) |[baidu](https://pan.baidu.com/s/1PCSC1Kvcg291gqp6h5pDCg)(ts7h) | [config](semantic_segmentation/configs/dpt/DPT_Large_480x480_160k_ade20k_bs_16.yaml) + +## Reference +``` +@article{ranftl2021vision, + title={Vision transformers for dense prediction}, + author={Ranftl, Ren{\'e} and Bochkovskiy, Alexey and Koltun, Vladlen}, + journal={arXiv preprint arXiv:2103.13413}, + year={2021} +} +``` diff --git a/semantic_segmentation/configs/segformer/README.md b/semantic_segmentation/configs/segformer/README.md new file mode 100644 index 00000000..09de1b30 --- /dev/null +++ b/semantic_segmentation/configs/segformer/README.md @@ -0,0 +1,27 @@ +# SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers, [arxiv](https://arxiv.org/abs/2105.15203) + +The official pytorch implementation is [here](https://github.com/NVlabs/SegFormer) + +## Framework +drawing + +## Model Zoo ## +### ADE20K ### +|Model | Backbone | Batch_size | Iteration | mIoU (ss) | mIoU (ms+flip) | Backbone_checkpoint | Model_checkpoint | ConfigFile | +|-----------|-----------|------------|-----------|-----------|----------------|-----------------------------------------------|-----------------------------------------------------------------------|------------| +|Segformer | MIT-B0 | 16 | 160k | 38.37 | - | TODO |[baidu](https://pan.baidu.com/s/1WOD9jGjQRLnwKrRYzgBong)(ges9) | [config](segformer_mit-b0_512x512_160k_ade20k.yaml) | +|Segformer | MIT-B1 | 16 | 160k | 42.20 | - | TODO |[baidu](https://pan.baidu.com/s/1aiSBXMd8nP82XK7sSZ05gg)(t4n4) | [config](segformer_mit-b1_512x512_160k_ade20k.yaml) | +|Segformer | MIT-B2 | 16 | 160k | 46.38 | - | TODO |[baidu](https://pan.baidu.com/s/1wFFh-K5t46YktkfoWUOTAg)(h5ar) | [config](segformer_mit-b2_512x512_160k_ade20k.yaml) | +|Segformer | MIT-B3 | 16 | 160k | 48.35 | - | TODO |[baidu](https://pan.baidu.com/s/1IwBnDeLNyKgs-xjhlaB9ug)(g9n4) | [config](segformer_mit-b3_512x512_160k_ade20k.yaml) | +|Segformer | MIT-B4 | 16 | 160k | 49.01 | - | TODO |[baidu](https://pan.baidu.com/s/1a25fCVlwJ-1TUh9HQfx7YA)(e4xw) | [config](segformer_mit-b4_512x512_160k_ade20k.yaml) | +|Segformer | MIT-B5 | 16 | 160k | 49.73 | - | TODO |[baidu](https://pan.baidu.com/s/15kXXxKEjjtJv-BmrPnSTOw)(uczo) | [config](segformer_mit-b5_512x512_160k_ade20k.yaml) | + +## Reference +``` +@article{xie2021segformer, + title={SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers}, + author={Xie, Enze and Wang, Wenhai and Yu, Zhiding and Anandkumar, Anima and Alvarez, Jose M and Luo, Ping}, + journal={arXiv preprint arXiv:2105.15203}, + year={2021} +} +``` diff --git a/semantic_segmentation/configs/segformer/segformer_mit-b0_256x256_20k_vaihingen.yaml b/semantic_segmentation/configs/segformer/segformer_mit-b0_256x256_20k_vaihingen.yaml new file mode 100644 index 00000000..906e0df0 --- /dev/null +++ 
b/semantic_segmentation/configs/segformer/segformer_mit-b0_256x256_20k_vaihingen.yaml @@ -0,0 +1,59 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'Vaihingen' # dataset name + DATA_PATH: 'G:\Datasets\Vaihingen' + CROP_SIZE: (256,256) # input_size (training) + NUM_CLASSES: 6 +MODEL: + NAME: 'Segformer' + ENCODER: + TYPE: 'MixVisionTransformer' + OUT_INDICES: [0,1,2,3] + PRETRAINED: None + DECODER_TYPE: 'SegformerHead' + SEGFORMER: + IN_CHANNELS: [32, 64, 160, 256] # BO is half of B1-B5 + CHANNELS: 256 + ALIGN_CORNERS: False + TRANS: + IN_CHANNELS: 3 + EMBED_DIM: 32 # BO is half of B1-B5 + NUM_STAGES: 4 + NUM_LAYERS: [2, 2, 2, 2] # BO-B5 differs + NUM_HEADS: [1, 2, 5, 8] + PATCH_SIZE: [7, 3, 3, 3] + STRIDES: [4, 2, 2, 2] + SR_RATIOS: [8, 4, 2, 1] + HIDDEN_SIZE: 768 + MLP_RATIO: 4 + QKV_BIAS: True + DROPOUT: 0.0 + ATTENTION_DROPOUT: 0.0 + DROP_PATH: 0.1 + +TRAIN: + BASE_LR: 0.00006 + END_LR: 0.0 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 2000 + WEIGHT_DECAY: 0.01 + POWER: 1.0 + DECAY_STEPS: 2000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'AdamW' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + IMAGE_BASE_SIZE: [2048, 1536] + CROP_SIZE: [256, 256] + STRIDE_SIZE: [171, 171] + MEAN: [123.675, 116.28, 103.53] + STD: [58.395, 57.12, 57.375] +SAVE_DIR: "./output/segformer_mit-b0_256x256_20k_vaihingen" + + + diff --git a/semantic_segmentation/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.yaml b/semantic_segmentation/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.yaml new file mode 100644 index 00000000..b1c2af23 --- /dev/null +++ b/semantic_segmentation/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.yaml @@ -0,0 +1,54 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/aistudio/data/ADEChallengeData2016' + CROP_SIZE: (512,512) # input_size (training) + NUM_CLASSES: 150 +MODEL: + NAME: 'Segformer' + ENCODER: + TYPE: 'MixVisionTransformer' + OUT_INDICES: [0,1,2,3] + PRETRAINED: None + DECODER_TYPE: 'SegformerHead' + SEGFORMER: + IN_CHANNELS: [32, 64, 160, 256] # BO is half of B1-B5 + CHANNELS: 256 + ALIGN_CORNERS: False + TRANS: + IN_CHANNELS: 3 + EMBED_DIM: 32 # BO is half of B1-B5 + NUM_STAGES: 4 + NUM_LAYERS: [2, 2, 2, 2] # BO-B5 differs + NUM_HEADS: [1, 2, 5, 8] + PATCH_SIZE: [7, 3, 3, 3] + STRIDES: [4, 2, 2, 2] + SR_RATIOS: [8, 4, 2, 1] + HIDDEN_SIZE: 768 + MLP_RATIO: 4 + QKV_BIAS: True + DROPOUT: 0.0 + ATTENTION_DROPOUT: 0.0 + DROP_PATH: 0.1 + +TRAIN: + BASE_LR: 0.00006 + END_LR: 0.0 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 2000 + WEIGHT_DECAY: 0.01 + POWER: 1.0 + DECAY_STEPS: 2000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'AdamW' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + IMAGE_BASE_SIZE: 512 + CROP_SIZE: [512, 512] + STRIDE_SIZE: [512, 512] +SAVE_DIR: "./output/segformer_mit-b0_512x512_160k_ade20k" diff --git a/semantic_segmentation/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.yaml b/semantic_segmentation/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.yaml new file mode 100644 index 00000000..7857425a --- /dev/null +++ b/semantic_segmentation/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.yaml @@ -0,0 +1,54 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/aistudio/data/ADEChallengeData2016' + CROP_SIZE: (512,512) # input_size 
(training) + NUM_CLASSES: 150 +MODEL: + NAME: 'Segformer' + ENCODER: + TYPE: 'MixVisionTransformer' + OUT_INDICES: [0,1,2,3] + PRETRAINED: None + DECODER_TYPE: 'SegformerHead' + SEGFORMER: + IN_CHANNELS: [64, 128, 320, 512] # BO is half of B1-B5 + CHANNELS: 256 + ALIGN_CORNERS: False + TRANS: + IN_CHANNELS: 3 + EMBED_DIM: 64 # BO is half of B1-B5 + NUM_STAGES: 4 + NUM_LAYERS: [2, 2, 2, 2] # BO-B5 differs + NUM_HEADS: [1, 2, 5, 8] + PATCH_SIZE: [7, 3, 3, 3] + STRIDES: [4, 2, 2, 2] + SR_RATIOS: [8, 4, 2, 1] + HIDDEN_SIZE: 768 + MLP_RATIO: 4 + QKV_BIAS: True + DROPOUT: 0.0 + ATTENTION_DROPOUT: 0.0 + DROP_PATH: 0.1 + +TRAIN: + BASE_LR: 0.00006 + END_LR: 0.0 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 2000 + WEIGHT_DECAY: 0.01 + POWER: 1.0 + DECAY_STEPS: 2000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'AdamW' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + IMAGE_BASE_SIZE: 512 + CROP_SIZE: [512, 512] + STRIDE_SIZE: [512, 512] +SAVE_DIR: "./output/segformer_mit-b1_512x512_160k_ade20k" diff --git a/semantic_segmentation/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.yaml b/semantic_segmentation/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.yaml new file mode 100644 index 00000000..82638ea0 --- /dev/null +++ b/semantic_segmentation/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.yaml @@ -0,0 +1,54 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/aistudio/data/ADEChallengeData2016' + CROP_SIZE: (512,512) # input_size (training) + NUM_CLASSES: 150 +MODEL: + NAME: 'Segformer' + ENCODER: + TYPE: 'MixVisionTransformer' + OUT_INDICES: [0,1,2,3] + PRETRAINED: None + DECODER_TYPE: 'SegformerHead' + SEGFORMER: + IN_CHANNELS: [64, 128, 320, 512] # BO is half of B1-B5 + CHANNELS: 256 + ALIGN_CORNERS: False + TRANS: + IN_CHANNELS: 3 + EMBED_DIM: 64 # BO is half of B1-B5 + NUM_STAGES: 4 + NUM_LAYERS: [3, 4, 6, 3] # BO-B5 differs + NUM_HEADS: [1, 2, 5, 8] + PATCH_SIZE: [7, 3, 3, 3] + STRIDES: [4, 2, 2, 2] + SR_RATIOS: [8, 4, 2, 1] + HIDDEN_SIZE: 768 + MLP_RATIO: 4 + QKV_BIAS: True + DROPOUT: 0.0 + ATTENTION_DROPOUT: 0.0 + DROP_PATH: 0.1 + +TRAIN: + BASE_LR: 0.00006 + END_LR: 0.0 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 2000 + WEIGHT_DECAY: 0.01 + POWER: 1.0 + DECAY_STEPS: 2000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'AdamW' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + IMAGE_BASE_SIZE: 512 + CROP_SIZE: [512, 512] + STRIDE_SIZE: [512, 512] +SAVE_DIR: "./output/segformer_mit-b2_512x512_160k_ade20k" diff --git a/semantic_segmentation/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.yaml b/semantic_segmentation/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.yaml new file mode 100644 index 00000000..892fb0b9 --- /dev/null +++ b/semantic_segmentation/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.yaml @@ -0,0 +1,54 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/aistudio/data/ADEChallengeData2016' + CROP_SIZE: (512,512) # input_size (training) + NUM_CLASSES: 150 +MODEL: + NAME: 'Segformer' + ENCODER: + TYPE: 'MixVisionTransformer' + OUT_INDICES: [0,1,2,3] + PRETRAINED: None + DECODER_TYPE: 'SegformerHead' + SEGFORMER: + IN_CHANNELS: [64, 128, 320, 512] # BO is half of B1-B5 + CHANNELS: 256 + ALIGN_CORNERS: False + TRANS: + IN_CHANNELS: 3 + EMBED_DIM: 64 # BO is half of B1-B5 + NUM_STAGES: 4 + NUM_LAYERS: [3, 
4, 18, 3] # BO-B5 differs + NUM_HEADS: [1, 2, 5, 8] + PATCH_SIZE: [7, 3, 3, 3] + STRIDES: [4, 2, 2, 2] + SR_RATIOS: [8, 4, 2, 1] + HIDDEN_SIZE: 768 + MLP_RATIO: 4 + QKV_BIAS: True + DROPOUT: 0.0 + ATTENTION_DROPOUT: 0.0 + DROP_PATH: 0.1 + +TRAIN: + BASE_LR: 0.00006 + END_LR: 0.0 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 2000 + WEIGHT_DECAY: 0.01 + POWER: 1.0 + DECAY_STEPS: 2000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'AdamW' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + IMAGE_BASE_SIZE: 512 + CROP_SIZE: [512, 512] + STRIDE_SIZE: [512, 512] +SAVE_DIR: "./output/segformer_mit-b3_512x512_160k_ade20k" diff --git a/semantic_segmentation/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.yaml b/semantic_segmentation/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.yaml new file mode 100644 index 00000000..7984534c --- /dev/null +++ b/semantic_segmentation/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.yaml @@ -0,0 +1,54 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/aistudio/data/ADEChallengeData2016' + CROP_SIZE: (512,512) # input_size (training) + NUM_CLASSES: 150 +MODEL: + NAME: 'Segformer' + ENCODER: + TYPE: 'MixVisionTransformer' + OUT_INDICES: [0,1,2,3] + PRETRAINED: None + DECODER_TYPE: 'SegformerHead' + SEGFORMER: + IN_CHANNELS: [64, 128, 320, 512] # BO is half of B1-B5 + CHANNELS: 256 + ALIGN_CORNERS: False + TRANS: + IN_CHANNELS: 3 + EMBED_DIM: 64 # BO is half of B1-B5 + NUM_STAGES: 4 + NUM_LAYERS: [3, 8, 27, 3] # BO-B5 differs + NUM_HEADS: [1, 2, 5, 8] + PATCH_SIZE: [7, 3, 3, 3] + STRIDES: [4, 2, 2, 2] + SR_RATIOS: [8, 4, 2, 1] + HIDDEN_SIZE: 768 + MLP_RATIO: 4 + QKV_BIAS: True + DROPOUT: 0.0 + ATTENTION_DROPOUT: 0.0 + DROP_PATH: 0.1 + +TRAIN: + BASE_LR: 0.00006 + END_LR: 0.0 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 2000 + WEIGHT_DECAY: 0.01 + POWER: 1.0 + DECAY_STEPS: 2000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'AdamW' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + IMAGE_BASE_SIZE: 512 + CROP_SIZE: [512, 512] + STRIDE_SIZE: [512, 512] +SAVE_DIR: "./output/segformer_mit-b4_512x512_160k_ade20k" diff --git a/semantic_segmentation/configs/segformer/segformer_mit-b5_512x512_160k_ade20k.yaml b/semantic_segmentation/configs/segformer/segformer_mit-b5_512x512_160k_ade20k.yaml new file mode 100644 index 00000000..fde4eed4 --- /dev/null +++ b/semantic_segmentation/configs/segformer/segformer_mit-b5_512x512_160k_ade20k.yaml @@ -0,0 +1,54 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/aistudio/data/ADEChallengeData2016' + CROP_SIZE: (512,512) # input_size (training) + NUM_CLASSES: 150 +MODEL: + NAME: 'Segformer' + ENCODER: + TYPE: 'MixVisionTransformer' + OUT_INDICES: [0,1,2,3] + PRETRAINED: None + DECODER_TYPE: 'SegformerHead' + SEGFORMER: + IN_CHANNELS: [64, 128, 320, 512] # BO is half of B1-B5 + CHANNELS: 256 + ALIGN_CORNERS: False + TRANS: + IN_CHANNELS: 3 + EMBED_DIM: 64 # BO is half of B1-B5 + NUM_STAGES: 4 + NUM_LAYERS: [3, 6, 40, 3] # BO-B5 differs + NUM_HEADS: [1, 2, 5, 8] + PATCH_SIZE: [7, 3, 3, 3] + STRIDES: [4, 2, 2, 2] + SR_RATIOS: [8, 4, 2, 1] + HIDDEN_SIZE: 768 + MLP_RATIO: 4 + QKV_BIAS: True + DROPOUT: 0.0 + ATTENTION_DROPOUT: 0.0 + DROP_PATH: 0.1 + +TRAIN: + BASE_LR: 0.00006 + END_LR: 0.0 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 2000 + WEIGHT_DECAY: 0.01 + POWER: 1.0 + 
DECAY_STEPS: 2000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'AdamW' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + IMAGE_BASE_SIZE: 512 + CROP_SIZE: [512, 512] + STRIDE_SIZE: [512, 512] +SAVE_DIR: "./output/segformer_mit-b5_512x512_160k_ade20k" diff --git a/semantic_segmentation/configs/segmenter/README.md b/semantic_segmentation/configs/segmenter/README.md new file mode 100644 index 00000000..d432afa5 --- /dev/null +++ b/semantic_segmentation/configs/segmenter/README.md @@ -0,0 +1,25 @@ +# Segmenter: Transformer for Semantic Segmentation, [arxiv](https://arxiv.org/pdf/2105.05633.pdf) + +The official pytorch implementation is [here](https://github.com/rstrudel/segmenter) +## Framework +drawing + +## Model Zoo ## +### ADE20K ### +|Model | Backbone | Batch_size | Iteration | mIoU (ss) | mIoU (ms+flip) | Backbone_checkpoint | Model_checkpoint | ConfigFile | +|-----------|-----------|------------|-----------|-----------|----------------|-----------------------------------------------|-----------------------------------------------------------------------|------------| +|Segmenter | ViT_Tiny | 16 | 160k | 38.45 | - | TODO |[baidu](https://pan.baidu.com/s/1nZptBc-IY_3PFramXSlovQ)(1k97) | [config](semantic_segmentation/configs/segmenter/segmenter_Tiny_512x512_160k_ade20k_bs_16.yaml) +|Segmenter | ViT_Small | 16 | 160k | 46.07 | - | TODO |[baidu](https://pan.baidu.com/s/1gKE-GEu7gX6dJsgtlvrmWg)(i8nv) | [config](semantic_segmentation/configs/segmenter/segmenter_small_512x512_160k_ade20k_bs_16.yaml) +|Segmenter | ViT_Base | 16 | 160k | 49.08 | - | TODO |[baidu](https://pan.baidu.com/s/1qb7HEtKW0kBSP6iv-r_Hjg)(hxrl) | [config](semantic_segmentation/configs/segmenter/segmenter_Base_512x512_160k_ade20k_bs_16.yaml) | +|Segmenter_Linear | DeiT_Base | 16 | 160k | 47.34 | - | TODO |[baidu](https://pan.baidu.com/s/1Hk_zcXUIt_h5sKiAjG2Pog)(5dpv) | [config](semantic_segmentation/configs/segmenter/segmenter_Base_distilled_512x512_160k_ade20k_bs_16.yaml) +|Segmenter | DeiT_Base | 16 | 160k | 49.27 | - | TODO |[baidu](https://pan.baidu.com/s/1-TBUuvcBKNgetSJr0CsAHA)(3kim) | [config](semantic_segmentation/configs/segmenter/segmenter_Base_distilled_512x512_160k_ade20k_bs_16.yaml) | + +## Reference +``` +@article{strudel2021, + title={Segmenter: Transformer for Semantic Segmentation}, + author={Strudel, Robin and Garcia, Ricardo and Laptev, Ivan and Schmid, Cordelia}, + journal={arXiv preprint arXiv:2105.05633}, + year={2021} +} +``` diff --git a/semantic_segmentation/configs/segmenter/segmenter_Large_480x480_160k_pascal_content_bs_16.yaml b/semantic_segmentation/configs/segmenter/segmenter_Large_480x480_160k_pascal_content_bs_16.yaml new file mode 100644 index 00000000..c55da3e6 --- /dev/null +++ b/semantic_segmentation/configs/segmenter/segmenter_Large_480x480_160k_pascal_content_bs_16.yaml @@ -0,0 +1,50 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'PascalContext' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/pascal_context' + CROP_SIZE: (480,480) + NUM_CLASSES: 60 +MODEL: + NAME: 'Segmenter' + ENCODER: + TYPE: 'ViT' + OUT_INDICES: [23] + PRETRAINED: None + DECODER_TYPE: 'MaskTransformer' + SEGMENTER: + NUM_LAYERS: 2 + NUM_CLASSES: 60 + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 1024 # 192(tiny), 384(small) 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 16 # 3(tiny), 6(small), 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 24 # 12(tiny), 12(small), 12(Base), 24(Large), 32(Huge) + 
QKV_BIAS: True + KEEP_CLS_TOKEN: True +TRAIN: + BASE_LR: 0.001 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 160000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 160000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + IMAGE_BASE_SIZE: 520 + CROP_SIZE: [480, 480] + STRIDE_SIZE: [320, 320] + MEAN: [127.5, 127.5, 127.5] + STD: [127.5, 127.5, 127.5] +SAVE_DIR: "./output/segmenter_base_512x512_160k_pascal_content_bs_16" + + + diff --git a/semantic_segmentation/configs/segmenter/segmenter_Large_512x512_160k_ade20k_bs_16.yaml b/semantic_segmentation/configs/segmenter/segmenter_Large_512x512_160k_ade20k_bs_16.yaml new file mode 100644 index 00000000..df677dd8 --- /dev/null +++ b/semantic_segmentation/configs/segmenter/segmenter_Large_512x512_160k_ade20k_bs_16.yaml @@ -0,0 +1,50 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/aistudio/data/ADEChallengeData2016' + CROP_SIZE: (512,512) # input_size (training) + NUM_CLASSES: 150 +MODEL: + NAME: 'Segmenter' + ENCODER: + TYPE: 'ViT' + OUT_INDICES: [23] + PRETRAINED: None + DECODER_TYPE: 'MaskTransformer' + SEGMENTER: + NUM_LAYERS: 2 + NUM_CLASSES: 150 + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 1024 # 192(tiny), 384(small) 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 16 # 3(tiny), 6(small), 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 24 # 12(tiny), 12(small), 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + KEEP_CLS_TOKEN: True +TRAIN: + BASE_LR: 0.001 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 160000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 160000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + IMAGE_BASE_SIZE: 512 + CROP_SIZE: [512, 512] + STRIDE_SIZE: [512, 512] + MEAN: [127.5, 127.5, 127.5] + STD: [127.5, 127.5, 127.5] +SAVE_DIR: "./output/segmenter_base_512x512_160k_ade20k_bs_16" + + + diff --git a/semantic_segmentation/configs/segmenter/segmenter_base_512x512_160k_ade20k_bs_16.yaml b/semantic_segmentation/configs/segmenter/segmenter_base_512x512_160k_ade20k_bs_16.yaml new file mode 100644 index 00000000..8548c9f1 --- /dev/null +++ b/semantic_segmentation/configs/segmenter/segmenter_base_512x512_160k_ade20k_bs_16.yaml @@ -0,0 +1,50 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/aistudio/data/ADEChallengeData2016' + CROP_SIZE: (512,512) # input_size (training) + NUM_CLASSES: 150 +MODEL: + NAME: 'Segmenter' + ENCODER: + TYPE: 'ViT' + OUT_INDICES: [11] + PRETRAINED: None + DECODER_TYPE: 'MaskTransformer' + SEGMENTER: + NUM_LAYERS: 2 + NUM_CLASSES: 150 + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 768 # 192(tiny), 384(small) 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 12 # 3(tiny), 6(small), 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 12 # 12(tiny), 12(small), 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + KEEP_CLS_TOKEN: True +TRAIN: + BASE_LR: 0.001 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 160000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 160000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + IMAGE_BASE_SIZE: 512 + CROP_SIZE: [512, 512] + STRIDE_SIZE: [512, 512] + MEAN: [127.5, 
127.5, 127.5] + STD: [127.5, 127.5, 127.5] +SAVE_DIR: "./output/segmenter_base_512x512_160k_ade20k_bs_16" + + + diff --git a/semantic_segmentation/configs/segmenter/segmenter_base_distilled_512x512_160k_ade20k_bs_16.yaml b/semantic_segmentation/configs/segmenter/segmenter_base_distilled_512x512_160k_ade20k_bs_16.yaml new file mode 100644 index 00000000..8aed0d1b --- /dev/null +++ b/semantic_segmentation/configs/segmenter/segmenter_base_distilled_512x512_160k_ade20k_bs_16.yaml @@ -0,0 +1,48 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/aistudio/data/ADEChallengeData2016' + CROP_SIZE: (512,512) # input_size (training) + NUM_CLASSES: 150 +MODEL: + NAME: 'Segmenter' + ENCODER: + TYPE: 'DeiT' + OUT_INDICES: [11] + PRETRAINED: None + DECODER_TYPE: 'MaskTransformer' + SEGMENTER: + NUM_LAYERS: 2 + NUM_CLASSES: 150 + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 768 # 192(tiny), 384(small) 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 12 # 3(tiny), 6(small), 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 12 # 12(tiny), 12(small), 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + KEEP_CLS_TOKEN: True +TRAIN: + BASE_LR: 0.001 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 160000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 160000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + IMAGE_BASE_SIZE: 512 + CROP_SIZE: [512, 512] + STRIDE_SIZE: [512, 512] +SAVE_DIR: "./output/segmenter_base_distilled_512x512_160k_ade20k_bs_16" + + + diff --git a/semantic_segmentation/configs/segmenter/segmenter_base_distilled_linear_512x512_160k_ade20k_bs_16.yaml b/semantic_segmentation/configs/segmenter/segmenter_base_distilled_linear_512x512_160k_ade20k_bs_16.yaml new file mode 100644 index 00000000..bb22d05d --- /dev/null +++ b/semantic_segmentation/configs/segmenter/segmenter_base_distilled_linear_512x512_160k_ade20k_bs_16.yaml @@ -0,0 +1,48 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/aistudio/data/ADEChallengeData2016' + CROP_SIZE: (512,512) # input_size (training) + NUM_CLASSES: 150 +MODEL: + NAME: 'Segmenter' + ENCODER: + TYPE: 'DeiT' + OUT_INDICES: [11] + PRETRAINED: None + DECODER_TYPE: 'LinearDecoder' + SEGMENTER: + NUM_LAYERS: 2 + NUM_CLASSES: 150 + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 768 # 192(tiny), 384(small) 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 12 # 3(tiny), 6(small), 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 12 # 12(tiny), 12(small), 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + KEEP_CLS_TOKEN: True +TRAIN: + BASE_LR: 0.001 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 160000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 160000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + IMAGE_BASE_SIZE: 512 + CROP_SIZE: [512, 512] + STRIDE_SIZE: [512, 512] +SAVE_DIR: "./output/segmenter_base_distilled_linear_512x512_160k_ade20k_bs_16" + + + diff --git a/semantic_segmentation/configs/segmenter/segmenter_base_linear_256x256_20k_vaihingen_bs_16.yaml b/semantic_segmentation/configs/segmenter/segmenter_base_linear_256x256_20k_vaihingen_bs_16.yaml new file mode 100644 index 00000000..8d4cc65e --- /dev/null +++ 
b/semantic_segmentation/configs/segmenter/segmenter_base_linear_256x256_20k_vaihingen_bs_16.yaml @@ -0,0 +1,50 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'Vaihingen' # dataset name + DATA_PATH: 'G:\Datasets\Vaihingen' + CROP_SIZE: (256,256) # input_size (training) + NUM_CLASSES: 6 +MODEL: + NAME: 'Segmenter' + ENCODER: + TYPE: 'ViT' + OUT_INDICES: [11] + PRETRAINED: 'pretrain_models\backbones\vit_base_patch16_224.pdparams' + DECODER_TYPE: 'Linear' + SEGMENTER: + NUM_LAYERS: 2 + NUM_CLASSES: 6 + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 768 # 192(tiny), 384(small) 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 12 # 3(tiny), 6(small), 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 12 # 12(tiny), 12(small), 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + KEEP_CLS_TOKEN: True +TRAIN: + BASE_LR: 0.00006 + END_LR: 0.0 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 20000 + WEIGHT_DECAY: 0.01 + POWER: 1.0 + DECAY_STEPS: 20000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'AdamW' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + IMAGE_BASE_SIZE: [2048, 1536] + CROP_SIZE: [256, 256] + STRIDE_SIZE: [171, 171] + MEAN: [123.675, 116.28, 103.53] + STD: [58.395, 57.12, 57.375] +SAVE_DIR: "./output/segmenter_base_linear_256x256_20k_vaihingen_bs_16" + + + diff --git a/semantic_segmentation/configs/segmenter/segmenter_small_512x512_160k_ade20k_bs_16.yaml b/semantic_segmentation/configs/segmenter/segmenter_small_512x512_160k_ade20k_bs_16.yaml new file mode 100644 index 00000000..3edea0b4 --- /dev/null +++ b/semantic_segmentation/configs/segmenter/segmenter_small_512x512_160k_ade20k_bs_16.yaml @@ -0,0 +1,51 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/ADEChallengeData2016' + CROP_SIZE: (512,512) # input_size (training) + NUM_CLASSES: 150 +MODEL: + NAME: 'Segmenter' + ENCODER: + TYPE: 'ViT' + OUT_INDICES: [11] + PRETRAINED: None + DECODER_TYPE: 'MaskTransformer' + SEGMENTER: + NUM_LAYERS: 2 + NUM_CLASSES: 150 + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 384 # 192(tiny), 384(small) 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 6 # 3(tiny), 6(small), 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 12 # 12(tiny), 12(small), 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + KEEP_CLS_TOKEN: True +TRAIN: + BASE_LR: 0.001 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 160000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 160000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + SCALE_RATIOS: [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] + IMAGE_BASE_SIZE: 512 + CROP_SIZE: [512, 512] + STRIDE_SIZE: [512, 512] + MEAN: [127.5, 127.5, 127.5] + STD: [127.5, 127.5, 127.5] +SAVE_DIR: "./output/segmenter_small_512x512_160k_ade20k_bs_16" + + + diff --git a/semantic_segmentation/configs/segmenter/segmenter_tiny_512x512_160k_ade20k_bs_16.yaml b/semantic_segmentation/configs/segmenter/segmenter_tiny_512x512_160k_ade20k_bs_16.yaml new file mode 100644 index 00000000..fe9556be --- /dev/null +++ b/semantic_segmentation/configs/segmenter/segmenter_tiny_512x512_160k_ade20k_bs_16.yaml @@ -0,0 +1,50 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/aistudio/data/ADEChallengeData2016' + 
CROP_SIZE: (512,512) # input_size (training) + NUM_CLASSES: 150 +MODEL: + NAME: 'Segmenter' + ENCODER: + TYPE: 'ViT' + OUT_INDICES: [11] + PRETRAINED: None + DECODER_TYPE: 'MaskTransformer' + SEGMENTER: + NUM_LAYERS: 2 + NUM_CLASSES: 150 + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 192 # 192(tiny), 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 3 # 3(tiny), 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 12 # 12(tiny) 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + KEEP_CLS_TOKEN: True +TRAIN: + BASE_LR: 0.001 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 160000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 160000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + IMAGE_BASE_SIZE: 512 + CROP_SIZE: [512, 512] + STRIDE_SIZE: [512, 512] + MEAN: [127.5, 127.5, 127.5] + STD: [127.5, 127.5, 127.5] +SAVE_DIR: "./output/segmenter_tiny_512x512_160k_ade20k_bs_16" + + + diff --git a/semantic_segmentation/configs/setr/README.md b/semantic_segmentation/configs/setr/README.md new file mode 100644 index 00000000..3c60168f --- /dev/null +++ b/semantic_segmentation/configs/setr/README.md @@ -0,0 +1,45 @@ +# Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformers, [arxiv](https://arxiv.org/pdf/2012.15840.pdf) + +The official pytorch implementation is [here](https://github.com/fudan-zvg/SETR). +## Framework +drawing + +## Model Zoo ## +### Pascal Context ### +|Model | Backbone | Batch_size | mIoU (ss) | mIoU (ms+flip) | Backbone_checkpoint | Model_checkpoint | ConfigFile | +|-----------|-----------|------------|-----------|----------------|-----------------------------------------------|-----------------------------------------------------------------------|------------| +|SETR_Naive | ViT_large | 16 | 52.06 | 52.57 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [google](https://drive.google.com/file/d/1AUyBLeoAcMH0P_QGer8tdeU44muTUOCA/view?usp=sharing)/[baidu](https://pan.baidu.com/s/11XgmgYG071n_9fSGUcPpDQ)(xdb8) | [config](./SETR_Naive_Large_480x480_80k_pascal_context_bs_16.yaml) | +|SETR_PUP | ViT_large | 16 | 53.90 | 54.53 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [google](https://drive.google.com/file/d/1IY-yBIrDPg5CigQ18-X2AX6Oq3rvWeXL/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1v6ll68fDNCuXUIJT2Cxo-A)(6sji) | [config](./SETR_PUP_Large_480x480_80k_pascal_context_bs_16.yaml) | +|SETR_MLA | ViT_Large | 8 | 54.39 | 55.16 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [google](https://drive.google.com/file/d/1utU2h0TrtuGzRX5RMGroudiDcz0z6UmV/view)/[baidu](https://pan.baidu.com/s/1Eg0eyUQXc-Mg5fg0T3RADA)(wora)| [config](./SETR_MLA_Large_480x480_80k_pascal_context_bs_8.yaml) | +|SETR_MLA | ViT_large | 16 | 55.01 | 55.87 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [google](https://drive.google.com/file/d/1SOXB7sAyysNhI8szaBqtF8ZoxSaPNvtl/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1jskpqYbazKY1CKK3iVxAYA)(76h2) | [config](./SETR_MLA_Large_480x480_80k_pascal_context_bs_16.yaml) | + +### 
Cityscapes ### +|Model | Backbone | Batch_size | Iteration | mIoU (ss) | mIoU (ms+flip) | Backbone_checkpoint | Model_checkpoint | ConfigFile | +|-----------|-----------|------------|-----------|-----------|----------------|-----------------------------------------------|-----------------------------------------------------------------------|------------| +|SETR_Naive | ViT_Large | 8 | 40k | 76.71 | 79.03 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [google](https://drive.google.com/file/d/1QialLNMmvWW8oi7uAHhJZI3HSOavV4qj/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1F3IB31QVlsohqW8cRNphqw)(g7ro) | [config](./SETR_Naive_Large_768x768_40k_cityscapes_bs_8.yaml)| +|SETR_Naive | ViT_Large | 8 | 80k | 77.31 | 79.43 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [google](https://drive.google.com/file/d/1RJeSGoDaOP-fM4p1_5CJxS5ku_yDXXLV/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1XbHPBfaHS56HlaMJmdJf1A)(wn6q) | [config](./SETR_Naive_Large_768x768_80k_cityscapes_bs_8.yaml)| +|SETR_PUP | ViT_Large | 8 | 40k | 77.92 | 79.63 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [google](https://drive.google.com/file/d/12rMFMOaOYSsWd3f1hkrqRc1ThNT8K8NG/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1H8b3valvQ2oLU9ZohZl_6Q)(zmoi) | [config](./SETR_PUP_Large_768x768_40k_cityscapes_bs_8.yaml)| +|SETR_PUP | ViT_Large | 8 | 80k | 78.81 | 80.43 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [baidu](https://pan.baidu.com/s/1tkMhRzO0XHqKYM0lojE3_g)(f793) | [config](./SETR_PUP_Large_768x768_80k_cityscapes_bs_8.yaml)| +|SETR_MLA | ViT_Large | 8 | 40k | 76.70 | 78.96 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [baidu](https://pan.baidu.com/s/1sUug5cMKSo6mO7BEI4EV_w)(qaiw) | [config](./SETR_MLA_Large_768x768_40k_cityscapes_bs_8.yaml)| +|SETR_MLA | ViT_Large | 8 | 80k | 77.26 | 79.27 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [baidu](https://pan.baidu.com/s/1IqPZ6urdQb_0pbdJW2i3ow)(6bgj) | [config](./SETR_MLA_Large_768x768_80k_cityscapes_bs_8.yaml)| + + +### ADE20K ### +|Model | Backbone | Batch_size | Iteration | mIoU (ss) | mIoU (ms+flip) | Backbone_checkpoint | Model_checkpoint | ConfigFile | +|-----------|-----------|------------|-----------|-----------|----------------|-----------------------------------------------|-----------------------------------------------------------------------|------------| +|SETR_Naive | ViT_Large | 16 | 160k | 47.57 | 48.12 | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [baidu](https://pan.baidu.com/s/1_AY6BMluNn71UiMNZbnKqQ)(lugq) | [config](./SETR_Naive_Large_512x512_160k_ade20k_bs_16.yaml)| +|SETR_PUP | ViT_Large | 16 | 160k | 49.12 | - | 
[google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [baidu](https://pan.baidu.com/s/1N83rG0EZSksMGZT3njaspg)(udgs) | [config](./SETR_PUP_Large_512x512_160k_ade20k_bs_16.yaml)| +|SETR_MLA | ViT_Large | 8 | 160k | 47.80 | - | [google](https://drive.google.com/file/d/1TPgh7Po6ayYb1DksJeZp60LGnNyznr-r/view?usp=sharing)/[baidu](https://pan.baidu.com/s/18WSi8Jp3tCZgv_Vr3V1i7A)(owoj) | [baidu](https://pan.baidu.com/s/1L83sdXWL4XT02dvH2WFzCA)(mrrv) | [config](./SETR_MLA_Large_512x512_160k_ade20k_bs_8.yaml)| + + +## Reference +``` +@inproceedings{zheng2021rethinking, + title={Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers}, + author={Zheng, Sixiao and Lu, Jiachen and Zhao, Hengshuang and Zhu, Xiatian and Luo, Zekun and Wang, Yabiao and Fu, Yanwei and Feng, Jianfeng and Xiang, Tao and Torr, Philip HS and others}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={6881--6890}, + year={2021} +} +``` + diff --git a/semantic_segmentation/configs/setr/SETR_MLA_Large_480x480_80k_pascal_context_bs_16.yaml b/semantic_segmentation/configs/setr/SETR_MLA_Large_480x480_80k_pascal_context_bs_16.yaml new file mode 100644 index 00000000..ad8912a2 --- /dev/null +++ b/semantic_segmentation/configs/setr/SETR_MLA_Large_480x480_80k_pascal_context_bs_16.yaml @@ -0,0 +1,53 @@ +DATA: + BATCH_SIZE: 4 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'PascalContext' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/pascal_context' + CROP_SIZE: (480,480) + NUM_CLASSES: 60 +MODEL: + NAME: 'SETR_MLA' + ENCODER: + TYPE: 'ViT_MLA' + OUT_INDICES: [5, 11, 17, 23] # Base: [2, 5, 8, 11]; Large: [5, 11, 17, 23] + DECODER_TYPE: 'VIT_MLAHead' + PRETRAINED: './pretrain_models/backbones/vit_large_patch16_224.pdparams' + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 1024 # 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 16 # 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 24 # 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + MLA: + MLA_CHANNELS: 256 + MLAHEAD_CHANNELS: 128 + MLAHEAD_ALIGN_CORNERS: False + AUX: + AUXIHEAD: True + AUXHEAD_ALIGN_CORNERS: False +TRAIN: + BASE_LR: 0.001 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 80000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 80000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + SCALE_RATIOS: [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] + IMAGE_BASE_SIZE: 520 + CROP_SIZE: [480,480] + STRIDE_SIZE: [320,320] + +SAVE_DIR: "./output/SETR_MLA_Large_480x480_80k_pascal_context_bs_16" + + + diff --git a/semantic_segmentation/configs/setr/SETR_MLA_Large_480x480_80k_pascal_context_bs_8.yaml b/semantic_segmentation/configs/setr/SETR_MLA_Large_480x480_80k_pascal_context_bs_8.yaml new file mode 100644 index 00000000..c4e05d03 --- /dev/null +++ b/semantic_segmentation/configs/setr/SETR_MLA_Large_480x480_80k_pascal_context_bs_8.yaml @@ -0,0 +1,53 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'PascalContext' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/pascal_context' + CROP_SIZE: (480,480) + NUM_CLASSES: 60 +MODEL: + NAME: 'SETR_MLA' + ENCODER: + TYPE: 'ViT_MLA' + OUT_INDICES: [5, 11, 17, 23] # Base: [2, 5, 8, 11]; Large: [5, 11, 17, 23] + DECODER_TYPE: 'VIT_MLAHead' + PRETRAINED: 
'./pretrain_models/backbones/vit_large_patch16_224.pdparams' + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 1024 # 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 16 # 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 24 # 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + MLA: + MLA_CHANNELS: 256 + MLAHEAD_CHANNELS: 128 + MLAHEAD_ALIGN_CORNERS: False + AUX: + AUXIHEAD: True + AUXHEAD_ALIGN_CORNERS: False +TRAIN: + BASE_LR: 0.001 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 80000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 80000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + SCALE_RATIOS: [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] + IMAGE_BASE_SIZE: 520 + CROP_SIZE: [480,480] + STRIDE_SIZE: [320,320] + +SAVE_DIR: "./output/SETR_MLA_Large_480x480_80k_pascal_context_bs_8" + + + diff --git a/semantic_segmentation/configs/setr/SETR_MLA_Large_512x512_160k_ade20k_bs_8.yaml b/semantic_segmentation/configs/setr/SETR_MLA_Large_512x512_160k_ade20k_bs_8.yaml new file mode 100644 index 00000000..194c81f6 --- /dev/null +++ b/semantic_segmentation/configs/setr/SETR_MLA_Large_512x512_160k_ade20k_bs_8.yaml @@ -0,0 +1,53 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/ADEChallengeData2016' + CROP_SIZE: (512,512) + NUM_CLASSES: 150 +MODEL: + NAME: 'SETR_MLA' + ENCODER: + TYPE: 'ViT_MLA' + OUT_INDICES: [5, 11, 17, 23] # Base: [2, 5, 8, 11]; Large: [5, 11, 17, 23] + DECODER_TYPE: 'VIT_MLAHead' + PRETRAINED: './pretrain_models/backbones/vit_large_patch16_224.pdparams' + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 1024 # 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 16 # 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 24 # 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + MLA: + MLA_CHANNELS: 256 + MLAHEAD_CHANNELS: 128 + MLAHEAD_ALIGN_CORNERS: False + AUX: + AUXIHEAD: True + AUXHEAD_ALIGN_CORNERS: False +TRAIN: + BASE_LR: 0.01 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 80000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 80000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + SCALE_RATIOS: [0.5, 0.75, 1.0, 1.25] + IMAGE_BASE_SIZE: 576 # (2048,512) + CROP_SIZE: [512,512] + STRIDE_SIZE: [341,341] + +SAVE_DIR: "./output/SETR_MLA_Large_512x512_160k_ade20k_bs_8" + + + diff --git a/semantic_segmentation/configs/setr/SETR_MLA_Large_768x768_40k_cityscapes_bs_8.yaml b/semantic_segmentation/configs/setr/SETR_MLA_Large_768x768_40k_cityscapes_bs_8.yaml new file mode 100644 index 00000000..e1053f2f --- /dev/null +++ b/semantic_segmentation/configs/setr/SETR_MLA_Large_768x768_40k_cityscapes_bs_8.yaml @@ -0,0 +1,53 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'Cityscapes' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/cityscapes' + CROP_SIZE: (768,768) + NUM_CLASSES: 19 +MODEL: + NAME: 'SETR_MLA' + ENCODER: + TYPE: 'ViT_MLA' + OUT_INDICES: [5, 11, 17, 23] # Base: [2, 5, 8, 11]; Large: [5, 11, 17, 23] + DECODER_TYPE: 'VIT_MLAHead' + PRETRAINED: './pretrain_models/backbones/vit_large_patch16_224.pdparams' + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 1024 # 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 16 # 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 24 # 
12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + MLA: + MLA_CHANNELS: 256 + MLAHEAD_CHANNELS: 128 + MLAHEAD_ALIGN_CORNERS: False + AUX: + AUXIHEAD: True + AUXHEAD_ALIGN_CORNERS: False +TRAIN: + BASE_LR: 0.01 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 40000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 40000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + SCALE_RATIOS: [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] + IMAGE_BASE_SIZE: 1024 + CROP_SIZE: [768,768] + STRIDE_SIZE: [512,512] + +SAVE_DIR: "./output/SETR_MLA_Large_768x768_40k_cityscapes_bs_8" + + + diff --git a/semantic_segmentation/configs/setr/SETR_MLA_Large_768x768_80k_cityscapes_bs_8.yaml b/semantic_segmentation/configs/setr/SETR_MLA_Large_768x768_80k_cityscapes_bs_8.yaml new file mode 100644 index 00000000..896bb73d --- /dev/null +++ b/semantic_segmentation/configs/setr/SETR_MLA_Large_768x768_80k_cityscapes_bs_8.yaml @@ -0,0 +1,53 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'Cityscapes' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/cityscapes' + CROP_SIZE: (768,768) + NUM_CLASSES: 19 +MODEL: + NAME: 'SETR_MLA' + ENCODER: + TYPE: 'ViT_MLA' + OUT_INDICES: [5, 11, 17, 23] # Base: [2, 5, 8, 11]; Large: [5, 11, 17, 23] + DECODER_TYPE: 'VIT_MLAHead' + PRETRAINED: './pretrain_models/backbones/vit_large_patch16_224.pdparams' + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 1024 # 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 16 # 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 24 # 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + MLA: + MLA_CHANNELS: 256 + MLAHEAD_CHANNELS: 128 + MLAHEAD_ALIGN_CORNERS: False + AUX: + AUXIHEAD: True + AUXHEAD_ALIGN_CORNERS: False +TRAIN: + BASE_LR: 0.01 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 80000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 80000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + SCALE_RATIOS: [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] + IMAGE_BASE_SIZE: 1024 + CROP_SIZE: [768,768] + STRIDE_SIZE: [512,512] + +SAVE_DIR: "./output/SETR_MLA_Large_768x768_80k_cityscapes_bs_8" + + + diff --git a/semantic_segmentation/configs/setr/SETR_Naive_Large_480x480_80k_pascal_context_bs_16.yaml b/semantic_segmentation/configs/setr/SETR_Naive_Large_480x480_80k_pascal_context_bs_16.yaml new file mode 100644 index 00000000..af4b61e5 --- /dev/null +++ b/semantic_segmentation/configs/setr/SETR_Naive_Large_480x480_80k_pascal_context_bs_16.yaml @@ -0,0 +1,59 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'PascalContext' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/pascal_context' + CROP_SIZE: (480,480) # input_size (training) + NUM_CLASSES: 60 +MODEL: + NAME: 'SETR_Naive' + ENCODER: + TYPE: 'ViT' + OUT_INDICES: [9, 14, 19, 23] + PRETRAINED: './pretrain_models/backbones/vit_large_patch16_224.pdparams' + DECODER_TYPE: 'Naive_VisionTransformerUpHead' + PUP: + INPUT_CHANNEL: 1024 + NUM_CONV: 2 + NUM_UPSAMPLE_LAYER: 1 + CONV3x3_CONV1x1: False + ALIGN_CORNERS: False + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 1024 # 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 16 # 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 24 # 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + AUXPUP: + INPUT_CHANNEL: 1024 + NUM_CONV: 2 + NUM_UPSAMPLE_LAYER: 1 + 
CONV3x3_CONV1x1: False + ALIGN_CORNERS: False + AUX: + AUXIHEAD: True +TRAIN: + BASE_LR: 0.001 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 80000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 80000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + SCALE_RATIOS: [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] + IMAGE_BASE_SIZE: 520 + CROP_SIZE: [480,480] + STRIDE_SIZE: [320,320] +SAVE_DIR: "./output/SETR_Naive_Large_480x480_80k_pascal_context_bs_16" + + + diff --git a/semantic_segmentation/configs/setr/SETR_Naive_Large_512x512_160k_ade20k_bs_16.yaml b/semantic_segmentation/configs/setr/SETR_Naive_Large_512x512_160k_ade20k_bs_16.yaml new file mode 100644 index 00000000..b8ff0992 --- /dev/null +++ b/semantic_segmentation/configs/setr/SETR_Naive_Large_512x512_160k_ade20k_bs_16.yaml @@ -0,0 +1,59 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/ADEChallengeData2016' + CROP_SIZE: (512,512) # input_size (training) + NUM_CLASSES: 150 +MODEL: + NAME: 'SETR_Naive' + ENCODER: + TYPE: 'ViT' + OUT_INDICES: [9, 14, 19, 23] + PRETRAINED: './pretrain_models/backbones/vit_large_patch16_224.pdparams' + DECODER_TYPE: 'Naive_VisionTransformerUpHead' + PUP: + INPUT_CHANNEL: 1024 + NUM_CONV: 2 + NUM_UPSAMPLE_LAYER: 1 + CONV3x3_CONV1x1: False + ALIGN_CORNERS: False + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 1024 # 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 16 # 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 24 # 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + AUXPUP: + INPUT_CHANNEL: 1024 + NUM_CONV: 2 + NUM_UPSAMPLE_LAYER: 1 + CONV3x3_CONV1x1: False + ALIGN_CORNERS: False + AUX: + AUXIHEAD: True +TRAIN: + BASE_LR: 0.01 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 160000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 160000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + SCALE_RATIOS: [0.5, 0.75, 1.0] + IMAGE_BASE_SIZE: 576 + CROP_SIZE: [512,512] + STRIDE_SIZE: [341,341] +SAVE_DIR: "./output/SETR_Naive_Large_512x512_160k_ade20k_bs_16" + + + diff --git a/semantic_segmentation/configs/setr/SETR_Naive_Large_768x768_40k_cityscapes_bs_8.yaml b/semantic_segmentation/configs/setr/SETR_Naive_Large_768x768_40k_cityscapes_bs_8.yaml new file mode 100644 index 00000000..e40f4ddb --- /dev/null +++ b/semantic_segmentation/configs/setr/SETR_Naive_Large_768x768_40k_cityscapes_bs_8.yaml @@ -0,0 +1,59 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'Cityscapes' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/cityscapes' + CROP_SIZE: (768,768) # input_size (training) + NUM_CLASSES: 19 +MODEL: + NAME: 'SETR_Naive' + ENCODER: + TYPE: 'ViT' + OUT_INDICES: [9, 14, 19, 23] + PRETRAINED: './pretrain_models/backbones/vit_large_patch16_224.pdparams' + DECODER_TYPE: 'Naive_VisionTransformerUpHead' + PUP: + INPUT_CHANNEL: 1024 + NUM_CONV: 2 + NUM_UPSAMPLE_LAYER: 1 + CONV3x3_CONV1x1: False + ALIGN_CORNERS: False + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 1024 # 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 16 # 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 24 # 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + AUXPUP: + INPUT_CHANNEL: 1024 + NUM_CONV: 2 + NUM_UPSAMPLE_LAYER: 1 + CONV3x3_CONV1x1: False + 
ALIGN_CORNERS: False + AUX: + AUXIHEAD: True +TRAIN: + BASE_LR: 0.01 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 40000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 40000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + SCALE_RATIOS: [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] + IMAGE_BASE_SIZE: 1024 + CROP_SIZE: [768,768] + STRIDE_SIZE: [512,512] +SAVE_DIR: "./output/SETR_Naive_Large_768x768_40k_cityscapes_bs_8" + + + diff --git a/semantic_segmentation/configs/setr/SETR_Naive_Large_768x768_80k_cityscapes_bs_8.yaml b/semantic_segmentation/configs/setr/SETR_Naive_Large_768x768_80k_cityscapes_bs_8.yaml new file mode 100644 index 00000000..cac07aaa --- /dev/null +++ b/semantic_segmentation/configs/setr/SETR_Naive_Large_768x768_80k_cityscapes_bs_8.yaml @@ -0,0 +1,59 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'Cityscapes' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/cityscapes' + CROP_SIZE: (768,768) # input_size (training) + NUM_CLASSES: 19 +MODEL: + NAME: 'SETR_Naive' + ENCODER: + TYPE: 'ViT' + OUT_INDICES: [9, 14, 19, 23] + PRETRAINED: './pretrain_models/backbones/vit_large_patch16_224.pdparams' + DECODER_TYPE: 'Naive_VisionTransformerUpHead' + PUP: + INPUT_CHANNEL: 1024 + NUM_CONV: 2 + NUM_UPSAMPLE_LAYER: 1 + CONV3x3_CONV1x1: False + ALIGN_CORNERS: False + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 1024 # 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 16 # 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 24 # 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + AUXPUP: + INPUT_CHANNEL: 1024 + NUM_CONV: 2 + NUM_UPSAMPLE_LAYER: 1 + CONV3x3_CONV1x1: False + ALIGN_CORNERS: False + AUX: + AUXIHEAD: True +TRAIN: + BASE_LR: 0.01 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 80000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 80000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + SCALE_RATIOS: [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] + IMAGE_BASE_SIZE: 1024 + CROP_SIZE: [768,768] + STRIDE_SIZE: [512,512] +SAVE_DIR: "./output/SETR_Naive_Large_768x768_80k_cityscapes_bs_8" + + + diff --git a/semantic_segmentation/configs/setr/SETR_PUP_Large_480x480_80k_pascal_context_bs_16.yaml b/semantic_segmentation/configs/setr/SETR_PUP_Large_480x480_80k_pascal_context_bs_16.yaml new file mode 100644 index 00000000..c93b19a2 --- /dev/null +++ b/semantic_segmentation/configs/setr/SETR_PUP_Large_480x480_80k_pascal_context_bs_16.yaml @@ -0,0 +1,59 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'PascalContext' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/pascal_context' + CROP_SIZE: (480,480) + NUM_CLASSES: 60 +MODEL: + NAME: 'SETR_PUP' + ENCODER: + TYPE: 'ViT' + OUT_INDICES: [9, 14, 19, 23] + PRETRAINED: './pretrain_models/backbones/vit_large_patch16_224.pdparams' + DECODER_TYPE: 'PUP_VisionTransformerUpHead' + PUP: + INPUT_CHANNEL: 1024 + NUM_CONV: 4 + NUM_UPSAMPLE_LAYER: 4 + CONV3x3_CONV1x1: True + ALIGN_CORNERS: False + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 1024 # 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 16 # 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 24 # 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + AUXPUP: + INPUT_CHANNEL: 1024 + NUM_CONV: 2 + NUM_UPSAMPLE_LAYER: 2 + CONV3x3_CONV1x1: True + ALIGN_CORNERS: False + AUX: + 
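    # enable the auxiliary prediction head (auxiliary supervision during training) +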
AUXIHEAD: True +TRAIN: + BASE_LR: 0.001 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 80000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 80000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + SCALE_RATIOS: [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] + IMAGE_BASE_SIZE: 520 + CROP_SIZE: [480,480] + STRIDE_SIZE: [320,320] +SAVE_DIR: "./output/SETR_PUP_Large_480x480_80k_pascal_context_bs_16" + + + diff --git a/semantic_segmentation/configs/setr/SETR_PUP_Large_512x512_160k_ade20k_bs_16.yaml b/semantic_segmentation/configs/setr/SETR_PUP_Large_512x512_160k_ade20k_bs_16.yaml new file mode 100644 index 00000000..a910641f --- /dev/null +++ b/semantic_segmentation/configs/setr/SETR_PUP_Large_512x512_160k_ade20k_bs_16.yaml @@ -0,0 +1,59 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/ADEChallengeData2016' + CROP_SIZE: (512,512) + NUM_CLASSES: 150 +MODEL: + NAME: 'SETR_PUP' + ENCODER: + TYPE: 'ViT' + OUT_INDICES: [9, 14, 19, 23] + PRETRAINED: './pretrain_models/backbones/vit_large_patch16_224.pdparams' + DECODER_TYPE: 'PUP_VisionTransformerUpHead' + PUP: + INPUT_CHANNEL: 1024 + NUM_CONV: 4 + NUM_UPSAMPLE_LAYER: 4 + CONV3x3_CONV1x1: True + ALIGN_CORNERS: False + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 1024 # 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 16 # 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 24 # 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + AUXPUP: + INPUT_CHANNEL: 1024 + NUM_CONV: 2 + NUM_UPSAMPLE_LAYER: 2 + CONV3x3_CONV1x1: True + ALIGN_CORNERS: False + AUX: + AUXIHEAD: True +TRAIN: + BASE_LR: 0.01 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 160000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 160000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + SCALE_RATIOS: [0.5, 0.75, 1.0, 1.25] + IMAGE_BASE_SIZE: 576 + CROP_SIZE: [512,512] + STRIDE_SIZE: [341,341] +SAVE_DIR: "./output/SETR_PUP_Large_512x512_160k_ade20k_bs_16" + + + diff --git a/semantic_segmentation/configs/setr/SETR_PUP_Large_768x768_40k_cityscapes_bs_8.yaml b/semantic_segmentation/configs/setr/SETR_PUP_Large_768x768_40k_cityscapes_bs_8.yaml new file mode 100644 index 00000000..8c684060 --- /dev/null +++ b/semantic_segmentation/configs/setr/SETR_PUP_Large_768x768_40k_cityscapes_bs_8.yaml @@ -0,0 +1,59 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'Cityscapes' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/cityscapes' + CROP_SIZE: (768,768) + NUM_CLASSES: 19 +MODEL: + NAME: 'SETR_PUP' + ENCODER: + TYPE: 'ViT' + OUT_INDICES: [9, 14, 19, 23] + PRETRAINED: './pretrain_models/backbones/vit_large_patch16_224.pdparams' + DECODER_TYPE: 'PUP_VisionTransformerUpHead' + PUP: + INPUT_CHANNEL: 1024 + NUM_CONV: 4 + NUM_UPSAMPLE_LAYER: 4 + CONV3x3_CONV1x1: True + ALIGN_CORNERS: False + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 1024 # 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 16 # 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 24 # 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + AUXPUP: + INPUT_CHANNEL: 1024 + NUM_CONV: 2 + NUM_UPSAMPLE_LAYER: 2 + CONV3x3_CONV1x1: True + ALIGN_CORNERS: False + AUX: + AUXIHEAD: True +TRAIN: + BASE_LR: 0.01 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + 
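  # DECAY_STEPS below matches ITERS, so the polynomial schedule decays the LR from BASE_LR to END_LR over the whole 40k-iteration run +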
ITERS: 40000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 40000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + SCALE_RATIOS: [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] + IMAGE_BASE_SIZE: 1024 + CROP_SIZE: [768,768] + STRIDE_SIZE: [512,512] +SAVE_DIR: "./output/SETR_PUP_Large_768x768_40k_cityscapes_bs_8" + + + diff --git a/semantic_segmentation/configs/setr/SETR_PUP_Large_768x768_80k_cityscapes_bs_8.yaml b/semantic_segmentation/configs/setr/SETR_PUP_Large_768x768_80k_cityscapes_bs_8.yaml new file mode 100644 index 00000000..f6d5d83c --- /dev/null +++ b/semantic_segmentation/configs/setr/SETR_PUP_Large_768x768_80k_cityscapes_bs_8.yaml @@ -0,0 +1,59 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'Cityscapes' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/cityscapes' + CROP_SIZE: (768,768) + NUM_CLASSES: 19 +MODEL: + NAME: 'SETR_PUP' + ENCODER: + TYPE: 'ViT' + OUT_INDICES: [9, 14, 19, 23] + PRETRAINED: './pretrain_models/backbones/vit_large_patch16_224.pdparams' + DECODER_TYPE: 'PUP_VisionTransformerUpHead' + PUP: + INPUT_CHANNEL: 1024 + NUM_CONV: 4 + NUM_UPSAMPLE_LAYER: 4 + CONV3x3_CONV1x1: True + ALIGN_CORNERS: False + TRANS: + PATCH_SIZE: 16 + HIDDEN_SIZE: 1024 # 768(Base), 1024(Large), 1280(Huge) + MLP_RATIO: 4 # same as mlp_ratio = 4.0 + NUM_HEADS: 16 # 12(Base), 16(Large), 16(Huge) + NUM_LAYERS: 24 # 12(Base), 24(Large), 32(Huge) + QKV_BIAS: True + AUXPUP: + INPUT_CHANNEL: 1024 + NUM_CONV: 2 + NUM_UPSAMPLE_LAYER: 2 + CONV3x3_CONV1x1: True + ALIGN_CORNERS: False + AUX: + AUXIHEAD: True +TRAIN: + BASE_LR: 0.01 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 80000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 80000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + SCALE_RATIOS: [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] + IMAGE_BASE_SIZE: 1024 + CROP_SIZE: [768,768] + STRIDE_SIZE: [512,512] +SAVE_DIR: "./output/SETR_PUP_Large_768x768_80k_cityscapes_bs_8" + + + diff --git a/semantic_segmentation/configs/trans2seg/README.md b/semantic_segmentation/configs/trans2seg/README.md new file mode 100644 index 00000000..172be561 --- /dev/null +++ b/semantic_segmentation/configs/trans2seg/README.md @@ -0,0 +1,23 @@ +# Segmenting Transparent Object in the Wild with Transformer, [arxiv](https://arxiv.org/pdf/2101.08461.pdf) + +The official pytorch implementation is [here](https://github.com/xieenze/Trans2Seg) +## Framework +drawing + +## Model Zoo ## + +### Trans10kV2 ### +|Model | Backbone | Batch_size | Iteration | mIoU (ss) | mIoU (ms+flip) | Backbone_checkpoint | Model_checkpoint | ConfigFile | +|-----------|-----------|------------|-----------|-----------|----------------|-----------------------------------------------|-----------------------------------------------------------------------|------------| +|Trans2seg_Medium | Resnet50c | 16 | 80k | 72.25 | - | [google](https://drive.google.com/file/d/1C6nMg6DgQ73wzF21UwDVxmkcRTeKngnK/view?usp=sharing)/[baidu](https://pan.baidu.com/s/1hs0tbSGIeMLLGMq05NN--w)(4dd5) | [google](https://drive.google.com/file/d/1zGEBEN27CQMgZBYqqAg_agJE6CPLOpYW/view?usp=sharing)/[baidu](https://pan.baidu.com/s/102GUBeoEPMqMEqF3smgyCA)(qcb0) | [config](semantic_segmentation/configs/trans2seg/Trans2Seg_medium_512x512_80k_trans10kv2_bs_16.yaml)| + +## Reference +``` +@article{xie2021segmenting, + title={Segmenting transparent object in the wild with transformer}, + 
author={Xie, Enze and Wang, Wenjia and Wang, Wenhai and Sun, Peize and Xu, Hang and Liang, Ding and Luo, Ping}, + journal={arXiv preprint arXiv:2101.08461}, + year={2021} +} + +``` diff --git a/semantic_segmentation/configs/trans2seg/Trans2Seg_medium_512x512_80k_trans10kv2_bs_16.yaml b/semantic_segmentation/configs/trans2seg/Trans2Seg_medium_512x512_80k_trans10kv2_bs_16.yaml new file mode 100644 index 00000000..f4b1d7f3 --- /dev/null +++ b/semantic_segmentation/configs/trans2seg/Trans2Seg_medium_512x512_80k_trans10kv2_bs_16.yaml @@ -0,0 +1,31 @@ +DATA: + DATASET: "Trans10kV2" + BATCH_SIZE: 16 + BATCH_SIZE_VAL: 1 + DATA_PATH: 'E:/Trans10K_cls12' + CROP_SIZE: (512, 512) + NUM_CLASSES: 12 +TRAIN: + BASE_LR: 0.0001 + ITERS: 80000 + LR_SCHEDULER: + NAME: "PolynomialDecay" + OPTIMIZER: + NAME: 'ADAM' +VAL: + MULTI_SCALES_VAL: False + IMAGE_BASE_SIZE: 512 + CROP_SIZE: [512, 512] +MODEL: + NAME: "Trans2Seg" + ENCODER: + TYPE: "resnet50c" + MULTI_GRID: + MULTI_DILATION: + TRANS2SEG: + EMBED_DIM: 256 + DEPTH: 4 + NUM_HEADS: 8 + MLP_RATIO: 3. + HID_DIM: 64 +SAVE_DIR: "./output/trans10kv2/Trans2Seg_medium_512x512_80k_trans10kv2_bs_16" \ No newline at end of file diff --git a/semantic_segmentation/configs/upernet_swin/README.md b/semantic_segmentation/configs/upernet_swin/README.md new file mode 100644 index 00000000..71122bf1 --- /dev/null +++ b/semantic_segmentation/configs/upernet_swin/README.md @@ -0,0 +1,30 @@ +# Swin Transformer: Hierarchical Vision Transformer using Shifted Windows, [arxiv](https://arxiv.org/pdf/2103.14030.pdf) + +The official pytorch implementation is [here](https://github.com/SwinTransformer/Swin-Transformer-Semantic-Segmentation) +## Framework +drawing + +## Model Zoo ## +### ADE20K ### +|Model | Backbone | Batch_size | Iteration | mIoU (ss) | mIoU (ms+flip) | Backbone_checkpoint | Model_checkpoint | ConfigFile | +|-----------|-----------|------------|-----------|-----------|----------------|-----------------------------------------------|-----------------------------------------------------------------------|------------| +| UperNet | Swin_Tiny | 16 | 160k | 44.90 | 45.37 | - |[baidu](https://pan.baidu.com/s/1S8JR4ILw0u4I-DzU4MaeVQ)(lkhg) | [config](semantic_segmentation/configs/upernet_swin/upernet_swin_tiny_patch4_windown7_512x512_160k_ade20k.yaml) | +| UperNet | Swin_Small | 16 | 160k | 47.88 | 48.90 | - |[baidu](https://pan.baidu.com/s/17RKeSpuWqONVptQZ3B4kEA)(vvy1) | [config](semantic_segmentation/configs/upernet_swin/upernet_swin_small_patch4_windown7_512x512_160k_ade20k.yaml) | +| UperNet | Swin_Base | 16 | 160k | 48.59 | 49.04 | - |[baidu](https://pan.baidu.com/s/1bM15KHNsb0oSPblQwhxbgw)(y040) | [config](semantic_segmentation/configs/upernet_swin/upernet_swin_base_patch4_windown7_512x512_160k_ade20k.yaml) | + +## Reference +``` +@article{liu2021swin, + title={Swin transformer: Hierarchical vision transformer using shifted windows}, + author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining}, + journal={arXiv preprint arXiv:2103.14030}, + year={2021} +} +@inproceedings{xiao2018unified, + title={Unified perceptual parsing for scene understanding}, + author={Xiao, Tete and Liu, Yingcheng and Zhou, Bolei and Jiang, Yuning and Sun, Jian}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + pages={418--434}, + year={2018} +} +``` diff --git a/semantic_segmentation/configs/upernet_swin/upernet_swin_base_patch4_windown7_512x512_160k_ade20k.yaml 
b/semantic_segmentation/configs/upernet_swin/upernet_swin_base_patch4_windown7_512x512_160k_ade20k.yaml new file mode 100644 index 00000000..9558b060 --- /dev/null +++ b/semantic_segmentation/configs/upernet_swin/upernet_swin_base_patch4_windown7_512x512_160k_ade20k.yaml @@ -0,0 +1,66 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/ADEChallengeData2016' + CROP_SIZE: (512,512) # input_size (training) + NUM_CLASSES: 150 +MODEL: + NAME: 'UperNet_Swin' + ENCODER: + TYPE: 'SwinTransformer' + OUT_INDICES: [0, 1, 2, 3] # stage_i + PRETRAINED: './pretrain_models/backbones/vit_large_patch16_224.pdparams' + DECODER_TYPE: 'UperHead' + UPERHEAD: + IN_CHANNELS: [128, 256, 512, 1024] + IN_INDEX: [0, 1, 2, 3] + POOL_SCALES: [1, 2, 3, 6] + CHANNELS: 512 + DROP_RATIO: 0.1 + ALIGN_CORNERS: False + TRANS: + PATCH_SIZE: 4 + WINDOW_SIZE: 7 + IN_CHANNELS: 3 + HIDDEN_SIZE: 128 + EMBED_DIM: 128 + STAGE_DEPTHS: [2, 2, 18, 2] + NUM_HEADS: [4, 8, 16, 32] + MLP_RATIO: 4 + QKV_BIAS: True + QK_SCALE: None + APE: False # absolute positional embeddings + PATCH_NORM: True + AUX: + AUXIHEAD: True + AUXFCN: + IN_CHANNELS: 512 + UP_RATIO: 16 + +TRAIN: + BASE_LR: 0.00006 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 160000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 160000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + SCALE_RATIOS: [0.5, 0.75, 1.0] + IMAGE_BASE_SIZE: 576 + KEEP_ORI_SIZE: False + RESCALE_FROM_ORI: False + CROP_SIZE: [512,512] + STRIDE_SIZE: [341,341] +SAVE_DIR: "./output/UperNet_swin_base_patch4_windown7_512x512_160k_ade20k" + + + diff --git a/semantic_segmentation/configs/upernet_swin/upernet_swin_small_patch4_windown7_512x512_160k_ade20k.yaml b/semantic_segmentation/configs/upernet_swin/upernet_swin_small_patch4_windown7_512x512_160k_ade20k.yaml new file mode 100644 index 00000000..8fe22c9d --- /dev/null +++ b/semantic_segmentation/configs/upernet_swin/upernet_swin_small_patch4_windown7_512x512_160k_ade20k.yaml @@ -0,0 +1,64 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/ADEChallengeData2016' + CROP_SIZE: (512,512) # input_size (training) + NUM_CLASSES: 150 +MODEL: + NAME: 'UperNet_Swin' + ENCODER: + TYPE: 'SwinTransformer' + OUT_INDICES: [0, 1, 2, 3] # stage_i + PRETRAINED: './pretrain_models/backbones/vit_large_patch16_224.pdparams' + DECODER_TYPE: 'UperHead' + UPERHEAD: + IN_CHANNELS: [96, 192, 384, 768] + IN_INDEX: [0, 1, 2, 3] + POOL_SCALES: [1, 2, 3, 6] + CHANNELS: 512 + DROP_RATIO: 0.1 + ALIGN_CORNERS: False + TRANS: + PATCH_SIZE: 4 + WINDOW_SIZE: 7 + IN_CHANNELS: 3 + HIDDEN_SIZE: 96 # 768(Base), 1024(Large), 1280(Huge) + EMBED_DIM: 96 + STAGE_DEPTHS: [2, 2, 18, 2] + NUM_HEADS: [3, 6, 12, 24] + MLP_RATIO: 4 + QKV_BIAS: True + QK_SCALE: None + APE: False # absolute positional embeddings + PATCH_NORM: True + AUX: + AUXIHEAD: True + AUXFCN: + IN_CHANNELS: 384 + UP_RATIO: 16 + +TRAIN: + BASE_LR: 0.00006 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 160000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 160000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + SCALE_RATIOS: [0.5, 0.75, 1.0] + IMAGE_BASE_SIZE: 576 + CROP_SIZE: [512,512] + STRIDE_SIZE: [341,341] +SAVE_DIR: 
"./output/UperNet_swin_small_patch4_windown7_512x512_160k_ade20k" + + + diff --git a/semantic_segmentation/configs/upernet_swin/upernet_swin_tiny_patch4_windown7_512x512_160k_ade20k.yaml b/semantic_segmentation/configs/upernet_swin/upernet_swin_tiny_patch4_windown7_512x512_160k_ade20k.yaml new file mode 100644 index 00000000..5097418a --- /dev/null +++ b/semantic_segmentation/configs/upernet_swin/upernet_swin_tiny_patch4_windown7_512x512_160k_ade20k.yaml @@ -0,0 +1,64 @@ +DATA: + BATCH_SIZE: 2 # per GPU [total bs is set to 8 or 16] + BATCH_SIZE_VAL: 1 # per GPU + DATASET: 'ADE20K' # dataset name + DATA_PATH: '/home/ssd3/wutianyi/datasets/ADEChallengeData2016' + CROP_SIZE: (512,512) # input_size (training) + NUM_CLASSES: 150 +MODEL: + NAME: 'UperNet_Swin' + ENCODER: + TYPE: 'SwinTransformer' + OUT_INDICES: [0, 1, 2, 3] # stage_i + PRETRAINED: './pretrain_models/backbones/vit_large_patch16_224.pdparams' + DECODER_TYPE: 'UperHead' + UPERHEAD: + IN_CHANNELS: [96, 192, 384, 768] + IN_INDEX: [0, 1, 2, 3] + POOL_SCALES: [1, 2, 3, 6] + CHANNELS: 512 + DROP_RATIO: 0.1 + ALIGN_CORNERS: False + TRANS: + PATCH_SIZE: 4 + WINDOW_SIZE: 7 + IN_CHANNELS: 3 + HIDDEN_SIZE: 96 # 768(Base), 1024(Large), 1280(Huge) + EMBED_DIM: 96 + STAGE_DEPTHS: [2, 2, 6, 2] + NUM_HEADS: [3, 6, 12, 24] + MLP_RATIO: 4 + QKV_BIAS: True + QK_SCALE: None + APE: False # absolute positional embeddings + PATCH_NORM: True + AUX: + AUXIHEAD: True + AUXFCN: + IN_CHANNELS: 384 + UP_RATIO: 16 + +TRAIN: + BASE_LR: 0.00006 + END_LR: 1e-4 + DECODER_LR_COEF: 10.0 + GRAD_CLIP: 1.0 + ITERS: 160000 + WEIGHT_DECAY: 0.0 + POWER: 0.9 + DECAY_STEPS: 160000 + LR_SCHEDULER: + NAME: 'PolynomialDecay' + OPTIMIZER: + NAME: 'SGD' + MOMENTUM: 0.9 +VAL: + MULTI_SCALES_VAL: False + SCALE_RATIOS: [0.5, 0.75, 1.0] + IMAGE_BASE_SIZE: 576 + CROP_SIZE: [512,512] + STRIDE_SIZE: [341,341] +SAVE_DIR: "./output/UperNet_swin_tiny_patch4_windown7_512x512_160k_ade20k" + + + diff --git a/semantic_segmentation/demo/demo.py b/semantic_segmentation/demo/demo.py new file mode 100644 index 00000000..6798b351 --- /dev/null +++ b/semantic_segmentation/demo/demo.py @@ -0,0 +1,119 @@ +#!/usr/bin/python3 +import os +import time +import shutil +import random +import argparse +import numpy as np +import cv2 +from PIL import Image as PILImage +import shutil +import paddle +import paddle.nn.functional as F +import sys +sys.path.insert(1, os.path.join(sys.path[0], '..')) +from config import * +from src.api import infer +from src.transforms import Compose, Resize, Normalize +from src.models import get_model +from src.utils import logger, progbar +from src.utils import load_entire_model, resume +from src.utils.vis import visualize +from src.utils.vis import get_cityscapes_color_map + +def parse_args(): + parser = argparse.ArgumentParser(description="PaddleViT-Seg Demo") + parser.add_argument( + "--config", + dest='cfg', + default="../configs/setr/SETR_PUP_Large_768x768_80k_cityscapes_bs_8.yaml", + type=str, + help="the config file." 
+ ) + parser.add_argument( + "--model_path", + default="../pretrain_models/setr/SETR_PUP_cityscapes_b8_80k.pdparams", + type=str, + help="the path of weights file (segmentation model)" + ) + parser.add_argument( + "--pretrained_backbone", + default="../pretrain_models/backbones/vit_large_patch16_224.pdparams", + type=str, + help="the path of weights file (backbone)" + ) + parser.add_argument( + "--img_dir", + default="./img/", + type=str, + help="the directory of input images" + ) + parser.add_argument( + "--results_dir", + default="./results/", + type=str, + help="the directory of segmentation results" + ) + return parser.parse_args() + +if __name__ == '__main__': + config = get_config() + args = parse_args() + config = update_config(config, args) + place = 'gpu' if config.VAL.USE_GPU else 'cpu' + paddle.set_device(place) + # build model + model = get_model(config) + if args.model_path: + load_entire_model(model, args.model_path) + logger.info('Loaded trained params of model successfully') + model.eval() + + if os.path.exists(args.results_dir): + shutil.rmtree(args.results_dir) + os.makedirs(args.results_dir) + + nranks = paddle.distributed.ParallelEnv().nranks + local_rank = paddle.distributed.ParallelEnv().local_rank + # build transforms for input images + transforms_val = [ Resize(target_size=config.VAL.IMAGE_BASE_SIZE, + keep_ori_size=config.VAL.KEEP_ORI_SIZE), + Normalize(mean=config.VAL.MEAN, std=config.VAL.STD)] + transforms_val = Compose(transforms_val) + logger.info("Start predicting: ") + img_files = os.listdir(args.img_dir) + img_files = [ os.path.join(args.img_dir, item) for item in img_files ] + print("img_files: ", img_files) + progbar_val = progbar.Progbar(target=len(img_files), verbose=1) + with paddle.no_grad(): + for i, img_path in enumerate(img_files): + img = cv2.imread(img_path) + ori_shape = img.shape[:2] + img, _ = transforms_val(img) + img = img[np.newaxis, ...] 
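+            # convert the batched image array to a Paddle tensor and run sliding-window inference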
+ img = paddle.to_tensor(img) + pred = infer.ss_inference( + model=model, + img=img, + ori_shape=ori_shape, + is_slide=True, + base_size=config.VAL.IMAGE_BASE_SIZE, + stride_size=config.VAL.STRIDE_SIZE, + crop_size=config.VAL.CROP_SIZE, + num_classes=config.DATA.NUM_CLASSES, + rescale_from_ori=config.VAL.RESCALE_FROM_ORI) + pred = paddle.squeeze(pred) + pred = pred.numpy().astype('uint8') + img_name = os.path.basename(img_path) + # save image+mask + mask_added_image = visualize(img_path, pred, weight=0.6) + mask_added_image_path = os.path.join(args.results_dir, img_name) + cv2.imwrite(mask_added_image_path, mask_added_image) + # saving color mask + pred_mask = PILImage.fromarray(pred.astype(np.uint8), mode='P') + color_map = get_cityscapes_color_map() + pred_mask.putpalette(color_map) + pred_saved_path = os.path.join(args.results_dir, + img_name.rsplit(".")[0] + "_color.png") + pred_mask.save(pred_saved_path) + progbar_val.update(i + 1) diff --git a/semantic_segmentation/demo/img/dusseldorf_000088_000019_leftImg8bit.png b/semantic_segmentation/demo/img/dusseldorf_000088_000019_leftImg8bit.png new file mode 100755 index 00000000..6bdd3e69 Binary files /dev/null and b/semantic_segmentation/demo/img/dusseldorf_000088_000019_leftImg8bit.png differ diff --git a/semantic_segmentation/demo/img/frankfurt_000001_058914_leftImg8bit.png b/semantic_segmentation/demo/img/frankfurt_000001_058914_leftImg8bit.png new file mode 100755 index 00000000..475713d6 Binary files /dev/null and b/semantic_segmentation/demo/img/frankfurt_000001_058914_leftImg8bit.png differ diff --git a/semantic_segmentation/demo/img/hanover_000000_044195_leftImg8bit.png b/semantic_segmentation/demo/img/hanover_000000_044195_leftImg8bit.png new file mode 100755 index 00000000..9ca3d3c4 Binary files /dev/null and b/semantic_segmentation/demo/img/hanover_000000_044195_leftImg8bit.png differ diff --git a/semantic_segmentation/demo/img/lindau_000023_000019_leftImg8bit.png b/semantic_segmentation/demo/img/lindau_000023_000019_leftImg8bit.png new file mode 100755 index 00000000..b5fc5201 Binary files /dev/null and b/semantic_segmentation/demo/img/lindau_000023_000019_leftImg8bit.png differ diff --git a/semantic_segmentation/demo/img/zurich_000038_000019_leftImg8bit.png b/semantic_segmentation/demo/img/zurich_000038_000019_leftImg8bit.png new file mode 100755 index 00000000..230d7594 Binary files /dev/null and b/semantic_segmentation/demo/img/zurich_000038_000019_leftImg8bit.png differ diff --git a/semantic_segmentation/figure/dpt_framework.png b/semantic_segmentation/figure/dpt_framework.png new file mode 100644 index 00000000..832db14b Binary files /dev/null and b/semantic_segmentation/figure/dpt_framework.png differ diff --git a/semantic_segmentation/figure/ppvit_seg.png b/semantic_segmentation/figure/ppvit_seg.png new file mode 100644 index 00000000..4c244640 Binary files /dev/null and b/semantic_segmentation/figure/ppvit_seg.png differ diff --git a/semantic_segmentation/figure/segformer_framework.png b/semantic_segmentation/figure/segformer_framework.png new file mode 100644 index 00000000..b3e05130 Binary files /dev/null and b/semantic_segmentation/figure/segformer_framework.png differ diff --git a/semantic_segmentation/figure/segmenter_framework.png b/semantic_segmentation/figure/segmenter_framework.png new file mode 100644 index 00000000..46141831 Binary files /dev/null and b/semantic_segmentation/figure/segmenter_framework.png differ diff --git a/semantic_segmentation/figure/setr_framework.png 
b/semantic_segmentation/figure/setr_framework.png new file mode 100644 index 00000000..9dc5a382 Binary files /dev/null and b/semantic_segmentation/figure/setr_framework.png differ diff --git a/semantic_segmentation/figure/trans2seg_framework.png b/semantic_segmentation/figure/trans2seg_framework.png new file mode 100644 index 00000000..a8ec32df Binary files /dev/null and b/semantic_segmentation/figure/trans2seg_framework.png differ diff --git a/semantic_segmentation/figure/upernet_swin_framework.png b/semantic_segmentation/figure/upernet_swin_framework.png new file mode 100644 index 00000000..b8fee460 Binary files /dev/null and b/semantic_segmentation/figure/upernet_swin_framework.png differ diff --git a/semantic_segmentation/pretrain_models/backbones/note.md b/semantic_segmentation/pretrain_models/backbones/note.md new file mode 100644 index 00000000..9eaa0cda --- /dev/null +++ b/semantic_segmentation/pretrain_models/backbones/note.md @@ -0,0 +1,3 @@ +1. the weight files of backbone networks should be placed in this directory. + +For example: vit_large_patch16_224.pdparams diff --git a/semantic_segmentation/pretrain_models/setr/note.md b/semantic_segmentation/pretrain_models/setr/note.md new file mode 100644 index 00000000..23e28005 --- /dev/null +++ b/semantic_segmentation/pretrain_models/setr/note.md @@ -0,0 +1,3 @@ +1. the weight files of segmentation models should be placed in this directory. + +For example: SETR_MLA_pascal_context_b8_80k.pdparams diff --git a/semantic_segmentation/requirements.txt b/semantic_segmentation/requirements.txt new file mode 100644 index 00000000..8f5fba8c --- /dev/null +++ b/semantic_segmentation/requirements.txt @@ -0,0 +1,6 @@ +cityscapesScripts==2.2.0 +detail==4.0 +numpy==1.20.3 +opencv-python==4.5.2.52 +scipy==1.6.3 +yacs==0.1.8 diff --git a/semantic_segmentation/run_local.sh b/semantic_segmentation/run_local.sh new file mode 100755 index 00000000..0b4cccd9 --- /dev/null +++ b/semantic_segmentation/run_local.sh @@ -0,0 +1,18 @@ + + +# training +## single-gpu +#CUDA_VISIBLE_DEVICES=5 python3 train.py --config ./configs/SETR/SETR_MLA_Large_480x480_80k_pascal_context_bs_8.yaml + +## multi-gpu +#CUDA_VISIBLE_DEVICES=2,4,7 python3 -u -m paddle.distributed.launch train.py --config ./configs/SETR/SETR_MLA_Large_480x480_80k_pascal_context_bs_8.yaml + +# testing +## single-gpu +#CUDA_VISIBLE_DEVICES=3 python3 val.py --config ./configs/SETR/SETR_MLA_Large_480x480_80k_pascal_context_bs_8.yaml \ +# --model_path ./pretrain_models/setr/SETR_MLA_pascal_context_b8_80k.pdparams + +## multi-gpu +CUDA_VISIBLE_DEVICES=2,3,4,5,7 python3 -u -m paddle.distributed.launch val.py --config ./configs/SETR/SETR_MLA_Large_480x480_80k_pascal_context_bs_8.yaml \ + --model_path ./pretrain_models/setr/SETR_MLA_pascal_context_b8_80k.pdparams + diff --git a/semantic_segmentation/src/__init__.py b/semantic_segmentation/src/__init__.py new file mode 100644 index 00000000..ce252b2e --- /dev/null +++ b/semantic_segmentation/src/__init__.py @@ -0,0 +1,2 @@ +from . import models, datasets, transforms + diff --git a/semantic_segmentation/src/api/__init__.py b/semantic_segmentation/src/api/__init__.py new file mode 100644 index 00000000..430812f3 --- /dev/null +++ b/semantic_segmentation/src/api/__init__.py @@ -0,0 +1,3 @@ +from . 
import infer + +__all__ = ['infer'] diff --git a/semantic_segmentation/src/api/infer.py b/semantic_segmentation/src/api/infer.py new file mode 100644 index 00000000..99415b89 --- /dev/null +++ b/semantic_segmentation/src/api/infer.py @@ -0,0 +1,212 @@ +import numpy as np +import math +import cv2 +import collections.abc +import paddle +import paddle.nn.functional as F + +def slide_inference(model, img, crop_size, stride_size, num_classes): + """ + Inference by sliding-window with overlap, the overlap is equal to stride. + + Args: + model (paddle.nn.Layer): model to get logits of image. + im (Tensor): the input image. + crop_size (tuple|list): the size of sliding window, (w, h). + stride_size (tuple|list): the size of stride, (w, h). + num_classes (int): the number of classes + + Return: + final_logit (Tensor): The logit of input image, whose size is equal to + the size of img (not the orginal size). + """ + h_img, w_img = img.shape[-2:] + w_crop, h_crop = crop_size + w_stride, h_stride = stride_size + # calculate the crop nums + rows = max(h_img - h_crop + h_stride -1, 0) // h_stride + 1 + cols = max(w_img - w_crop + w_stride -1, 0) // w_stride + 1 + count = np.zeros([1, 1, h_img, w_img]) + final_logit = paddle.zeros([1, num_classes, h_img, w_img], dtype='float32') + for r in range(rows): + for c in range(cols): + h1 = r * h_stride + w1 = c * w_stride + h2 = min(h1 + h_crop, h_img) + w2 = min(w1 + w_crop, w_img) + h1 = max(h2 - h_crop, 0) + w1 = max(w2 - w_crop, 0) + img_crop = img[:, :, h1:h2, w1:w2] + logits = model(img_crop) + logit = logits[0] + final_logit += F.pad(logit, [w1, w_img - w2, h1, h_img - h2]) + count[:, :, h1:h2, w1:w2] += 1 + final_logit = final_logit.numpy() / count + final_logit = paddle.to_tensor(final_logit) + return final_logit + + +def ss_inference(model, + img, + ori_shape, + is_slide, + base_size, + stride_size, + crop_size, + num_classes, + rescale_from_ori=False): + """ + Single-scale inference for image. + + Args: + model (paddle.nn.Layer): model to get logits of image. + img (Tensor): the input image. + ori_shape (list): origin shape of image. + is_slide (bool): whether to infer by sliding window. + base_size (list): the size of short edge is resize to min(base_size) + when it is smaller than min(base_size) + stride_size (tuple|list): the size of stride, (w, h). It should be + probided if is_slide is True. + crop_size (tuple|list). the size of sliding window, (w, h). It should + be probided if is_slide is True. + num_classes (int): the number of classes + rescale_from_ori (bool): whether rescale image from the original size. + Default: False. + + Returns: + pred (tensor): If ori_shape is not None, a prediction with shape (1, 1, h, w) + is returned. If ori_shape is None, a logit with shape (1, num_classes, + h, w) is returned. + """ + if not is_slide: + logits = model(img) + if not isinstance(logits, collections.abc.Sequence): + raise TypeError("The type of logits must be one of " + "collections.abc.Sequence, e.g. list, tuple. 
But received {}" + .format(type(logits))) + logit = logits[0] + else: + # TODO (wutianyiRosun@gmail.com): when dataloader does not uses resize, + # rescale or padding + if rescale_from_ori: + h, w = img.shape[-2], img.shape[-1] + if min(h,w) < min(base_size): + new_short = min(base_size) + if h > w : + new_h, new_w = int(new_short * h / w), new_short + else: + new_h, new_w = new_short, int(new_short * w / h) + h, w = new_h, new_w + img = F.interpolate(img, (h, w), mode='bilinear') + #print("rescale, img.shape: ({}, {})".format(h,w)) + logit = slide_inference(model, img, crop_size, stride_size, num_classes) + + if ori_shape is not None: + # resize to original shape + logit = F.interpolate(logit, ori_shape, mode='bilinear', align_corners=False) + logit = F.softmax(logit, axis=1) + pred = paddle.argmax(logit, axis=1, keepdim=True, dtype='int32') + return pred + else: + return logit + + +def ms_inference(model, + img, + ori_shape, + is_slide, + base_size, + stride_size, + crop_size, + num_classes, + scales=[1.0,], + flip_horizontal=True, + flip_vertical=False, + rescale_from_ori=False): + + """ + Multi-scale inference. + + For each scale, the segmentation result is first generated by sliding-window + testing with overlap. Then the segmentation result is resize to the original + size, followed by softmax operation. Finally, the segmenation logits of all + scales are averaged (+argmax) + + Args: + model (paddle.nn.Layer): model to get logits of image. + img (Tensor): the input image. + ori_shape (list): origin shape of image. + is_slide (bool): whether to infer by sliding wimdow. + base_size (list): the size of short edge is resize to min(base_size) + when it is smaller than min(base_size) + crop_size (tuple|list). the size of sliding window, (w, h). It should + be probided if is_slide is True. + stride_size (tuple|list). the size of stride, (w, h). It should be + probided if is_slide is True. + num_classes (int): the number of classes + scales (list): scales for resize. Default: [1.0,]. + flip_horizontal (bool): whether to flip horizontally. Default: True + flip_vertical (bool): whether to flip vertically. Default: False. + rescale_from_ori (bool): whether rescale image from the original size. Default: False. + + Returns: + Pred (tensor): Prediction of image with shape (1, 1, h, w) is returned. 
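+
+    Example (illustrative sketch; assumes `model` is already built and `img` is a
+    preprocessed 4-D paddle.Tensor; the values mirror the Pascal-Context SETR configs):
+        pred = ms_inference(model, img, ori_shape=[520, 520], is_slide=True,
+                            base_size=[520, 520], stride_size=[320, 320],
+                            crop_size=[480, 480], num_classes=60,
+                            scales=[0.75, 1.0, 1.25], flip_horizontal=True)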
+ """ + if not isinstance(scales, (tuple, list)): + raise('`scales` expects tuple/list, but received {}'.format(type(scales))) + final_logit = 0 + if rescale_from_ori: + if not isinstance(base_size, tuple): + raise('base_size is not a tuple, but received {}'.format(type(tupel))) + h_input, w_input = base_size + else: + h_input, w_input = img.shape[-2], img.shape[-1] + for scale in scales: + h = int(h_input * scale + 0.5) + w = int(w_input * scale + 0.5) + if rescale_from_ori: + # TODO (wutianyiRosun@gmail.com): whole image testing, rescale + # original image according to the scale_factor between the + # origianl size and scale + # scale_factor := min ( max(scale) / max(ori_size), min(scale) / min(ori_size) ) + h_ori, w_ori = img.shape[-2], img.shape[-1] + max_long_edge = max(h, w) + max_short_edge = min(h, w) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + # compute new size + new_h = int(h_ori * float(scale_factor) + 0.5) + new_w = int(w_ori * float(scale_factor) + 0.5) + h, w = new_h, new_w + img = F.interpolate(img, (h, w), mode='bilinear') + logits = model(img) + logit = logits[0] + else: + # sliding-window testing + # if min(h,w) is smaller than crop_size[0], the smaller edge of the + # image will be matched to crop_size[0] maintaining the aspect ratio + if min(h,w) < crop_size[0]: + new_short = crop_size[0] + if h > w : + new_h, new_w = int(new_short * h / w), new_short + else: + new_h, new_w = new_short, int(new_short * w / h) + h, w = new_h, new_w + img = F.interpolate(img, (h, w), mode='bilinear') + logit = slide_inference(model, img, crop_size, stride_size, num_classes) + + logit = F.interpolate(logit, ori_shape, mode='bilinear', align_corners=False) + logit = F.softmax(logit, axis=1) + final_logit = final_logit + logit + # flip_horizontal testing + if flip_horizontal == True: + img_flip = img[:, :, :, ::-1] + logit_flip = slide_inference(model, img_flip, crop_size, + stride_size, num_classes) + logit = logit_flip[:, :, :, ::-1] + logit = F.interpolate(logit, ori_shape, mode='bilinear', align_corners=False) + logit = F.softmax(logit, axis=1) + final_logit = final_logit + logit + # TODO (wutianyiRosun@gmail.com): add flip_vertical testing + pred = paddle.argmax(final_logit, axis=1, keepdim=True, dtype='int32') + return pred diff --git a/semantic_segmentation/src/datasets/__init__.py b/semantic_segmentation/src/datasets/__init__.py new file mode 100644 index 00000000..23fd6bd5 --- /dev/null +++ b/semantic_segmentation/src/datasets/__init__.py @@ -0,0 +1,59 @@ +from .dataset import Dataset +from .cityscapes import Cityscapes +from .ade import ADE20K +from .pascal_context import PascalContext +from .vaihingen import Vaihingen +from .trans10k_v2 import Trans10kV2 + + +def get_dataset(config, data_transform, mode='train'): + if config.DATA.DATASET == "PascalContext": + if mode == 'train': + dataset = PascalContext( + transforms=data_transform, dataset_root=config.DATA.DATA_PATH, + num_classes=config.DATA.NUM_CLASSES, mode='train') + elif mode == 'val': + dataset = PascalContext( + transforms=data_transform, dataset_root=config.DATA.DATA_PATH, + num_classes=config.DATA.NUM_CLASSES, mode='val') + + elif config.DATA.DATASET == "Cityscapes": + if mode == 'train': + dataset = Cityscapes( + transforms=data_transform, dataset_root=config.DATA.DATA_PATH, + num_classes=config.DATA.NUM_CLASSES, mode='train') + elif mode == 'val': + dataset = Cityscapes( + transforms=data_transform, dataset_root=config.DATA.DATA_PATH, + num_classes=config.DATA.NUM_CLASSES, 
mode='val') + + elif config.DATA.DATASET == "ADE20K": + if mode == 'train': + dataset = ADE20K( + transforms=data_transform, dataset_root=config.DATA.DATA_PATH, + num_classes=config.DATA.NUM_CLASSES, mode='train') + elif mode == 'val': + dataset = ADE20K( + transforms=data_transform, dataset_root=config.DATA.DATA_PATH, + num_classes=config.DATA.NUM_CLASSES, mode='val') + + elif config.DATA.DATASET == "Vaihingen": + if mode == 'train': + dataset = Vaihingen( + transforms=data_transform, dataset_root=config.DATA.DATA_PATH, + num_classes=config.DATA.NUM_CLASSES, mode='train') + elif mode == 'val': + dataset = Vaihingen( + transforms=data_transform, dataset_root=config.DATA.DATA_PATH, + num_classes=config.DATA.NUM_CLASSES, mode='val') + elif config.DATA.DATASET == "Trans10kV2": + if mode == 'train': + dataset = Trans10kV2(transforms=data_transform, + dataset_root=config.DATA.DATA_PATH, num_classes=config.DATA.NUM_CLASSES, mode='train') + elif mode == 'val': + dataset = Trans10kV2(transforms=data_transform, + dataset_root=config.DATA.DATA_PATH, num_classes=config.DATA.NUM_CLASSES, mode='val') + else: + raise NotImplementedError("{} dataset is not supported".format(config.DATA.DATASET)) + + return dataset diff --git a/semantic_segmentation/src/datasets/ade.py b/semantic_segmentation/src/datasets/ade.py new file mode 100644 index 00000000..f5d505a8 --- /dev/null +++ b/semantic_segmentation/src/datasets/ade.py @@ -0,0 +1,66 @@ +import os +import numpy as np +from PIL import Image +from src.datasets import Dataset +from src.transforms import Compose +import src.transforms.functional as F + + +class ADE20K(Dataset): + """ADE20K + + It is a densely annotated dataset with the instances of stuff, objects, + and parts, covering a diverse set of visual concepts in scenes. The + annotated images cover the scene categories from the SUN and Places database. + + Args: + transforms (list): A list of image transformations. + dataset_root (str, optional): The ADK20K dataset directory. Default: None. + mode (str, optional): A subset of the entire dataset. + It should be one of ('train', 'val'). Default: 'train'. + num_classes (int): the number of classes + """ + + def __init__(self, transforms, dataset_root=None, mode='train', num_classes=150): + super(ADE20K, self).__init__(transforms=transforms, num_classes=num_classes, + dataset_root=dataset_root, mode=mode) + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + mode = mode.lower() + self.mode = mode + self.file_list = list() + self.num_classes = num_classes + self.ignore_index = 255 + + if mode not in ['train', 'val']: + raise ValueError("`mode` should be one of ('train', 'val') in" + "ADE20K dataset, but got {}.".format(mode)) + if mode == 'train': + img_dir = os.path.join(self.dataset_root, 'images/training') + label_dir = os.path.join(self.dataset_root, 'annotations/training') + elif mode == 'val': + img_dir = os.path.join(self.dataset_root, 'images/validation') + label_dir = os.path.join(self.dataset_root,'annotations/validation') + img_files = os.listdir(img_dir) + label_files = [i.replace('.jpg', '.png') for i in img_files] + for i in range(len(img_files)): + img_path = os.path.join(img_dir, img_files[i]) + label_path = os.path.join(label_dir, label_files[i]) + self.file_list.append([img_path, label_path]) + + def __getitem__(self, idx): + image_path, label_path = self.file_list[idx] + if self.mode == 'val': + img, _ = self.transforms(img=image_path) + label = np.asarray(Image.open(label_path)) + # The class 0 is ignored. 
And it will equal to 255 after + # subtracted 1, because the dtype of label is uint8. + label = label - 1 + label = label[np.newaxis, :, :] + return img, label + else: + img, label = self.transforms(img=image_path, label=label_path) + label = label - 1 + # Recover the ignore pixels adding by transform + label[label == 254] = 255 + return img, label diff --git a/semantic_segmentation/src/datasets/cityscapes.py b/semantic_segmentation/src/datasets/cityscapes.py new file mode 100644 index 00000000..70c6ff06 --- /dev/null +++ b/semantic_segmentation/src/datasets/cityscapes.py @@ -0,0 +1,50 @@ +import os +import glob +from src.datasets import Dataset +from src.transforms import Compose + + +class Cityscapes(Dataset): + """Cityscapes + + It contains a diverse set of stereo video sequences recorded in street + scenes from 50 different cities, with high quality pixel-level annotations + of 5000 frames in addition to a larger set of 20000 weakly annotated frames. + + Args: + transforms (list): Transforms for image. + dataset_root (str): Cityscapes dataset directory. + mode (str, optional): Which part of dataset to use. Default: 'train'. + num_classes (int): the number of classes + """ + + def __init__(self, transforms, dataset_root, mode='train', num_classes=19): + super(Cityscapes, self).__init__(transforms=transforms, + num_classes=num_classes, dataset_root=dataset_root, mode=mode) + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + self.file_list = list() + mode = mode.lower() + self.mode = mode + self.num_classes = self.num_classes + self.ignore_index = 255 + + img_dir = os.path.join(self.dataset_root, 'leftImg8bit') + label_dir = os.path.join(self.dataset_root, 'gtFine') + if self.dataset_root is None or not os.path.isdir( + self.dataset_root) or not os.path.isdir( + img_dir) or not os.path.isdir(label_dir): + raise ValueError("The dataset is not Found or the folder structure" + "is nonconfoumance.") + + label_files = sorted( + glob.glob(os.path.join(label_dir, mode, '*', + '*_gtFine_labelTrainIds.png'))) + img_files = sorted( + glob.glob(os.path.join(img_dir, mode, '*', '*_leftImg8bit.png'))) + + self.file_list = [[ + img_path, label_path + ] for img_path, label_path in zip(img_files, label_files)] + + print("mode: {}, file_nums: {}".format(mode, len(self.file_list))) diff --git a/semantic_segmentation/src/datasets/cocostuff.py b/semantic_segmentation/src/datasets/cocostuff.py new file mode 100644 index 00000000..927178b2 --- /dev/null +++ b/semantic_segmentation/src/datasets/cocostuff.py @@ -0,0 +1,47 @@ +import os +import glob +from src.datasets import Dataset +from src.transforms import Compose + + +class CocoStuff(Dataset): + """CocoStuff + + COCO-Stuff dataset `https://github.com/nightrome/cocostuff`. + There are 164k images in COCO-stuff dataset that span over 172 categories + including 80 things, 91 stuff, and 1 unlabeled class. + + Args: + transforms (list): Transforms for image. + dataset_root (str): Cityscapes dataset directory. + mode (str): Which part of dataset to use, train or val. Default: 'train'. 
+ num_classes (int): the number of classes + """ + + def __init__(self, transforms, dataset_root, mode='train', num_classes=172): + super(CocoStuff, self).__init__(transforms=transforms, + num_classes=num_classes, dataset_root=dataset_root, mode=mode) + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + self.file_list = list() + mode = mode.lower() + self.mode = mode + self.num_classes = num_classes + self.ignore_index = 255 + if mode not in ['train', 'val']: + raise ValueError("mode should be 'train', 'val'," + "but got {}.".format(mode)) + img_dir = os.path.join(self.dataset_root, 'images') + label_dir = os.path.join(self.dataset_root, 'annotations') + if self.dataset_root is None or not os.path.isdir( + self.dataset_root) or not os.path.isdir( + img_dir) or not os.path.isdir(label_dir): + raise ValueError("The dataset is not Found or the folder structure" + "is nonconfoumance.") + label_files = sorted( + glob.glob(os.path.join(label_dir, mode + '2017', '*.png'))) + img_files = sorted( + glob.glob(os.path.join(img_dir, mode + '2017', '*.jpg'))) + self.file_list = [[ + img_path, label_path + ] for img_path, label_path in zip(img_files, label_files)] diff --git a/semantic_segmentation/src/datasets/dataset.py b/semantic_segmentation/src/datasets/dataset.py new file mode 100644 index 00000000..5062c88d --- /dev/null +++ b/semantic_segmentation/src/datasets/dataset.py @@ -0,0 +1,72 @@ +import os +import paddle +import numpy as np +from PIL import Image +from src.transforms import Compose +import src.transforms.functional as F + + +class Dataset(paddle.io.Dataset): + """ + The custom dataset that conforms to the format. + + Args: + transforms (list): Transforms for image. + dataset_root (str): The dataset directory. + num_classes (int): Number of classes. + mode (str, optional): which part of dataset to use. it is one of + ('train', 'val', 'test'). Default: 'train'. + train_path (str, optional): The train dataset file. When mode is + 'train', train_path is necessary. + val_path (str. optional): The evaluation dataset file. When mode + is 'val', val_path is necessary. The contents is the same as train_path + test_path (str, optional): The test dataset file. When mode is 'test', + test_path is necessary. + ignore_index (int): ignore label, default=255 + + """ + + def __init__(self, + transforms, + dataset_root, + num_classes, + mode='train', + train_path=None, + val_path=None, + test_path=None, + ignore_index=255): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + self.file_list = list() + mode = mode.lower() + self.mode = mode + self.num_classes = num_classes + self.ignore_index = ignore_index + + if mode.lower() not in ['train', 'val', 'test']: + raise ValueError("mode should be 'train', 'val' or 'test', " + "but got {}.".format(mode)) + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + self.dataset_root = dataset_root + if not os.path.exists(self.dataset_root): + raise FileNotFoundError("there is not `dataset_root`: {}." + .format(self.dataset_root)) + + def __getitem__(self, idx): + image_path, label_path = self.file_list[idx] + if self.mode == 'test': + img, _ = self.transforms(img=image_path) + img = img[np.newaxis, ...] 
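+            # np.newaxis above prepends a batch dimension, so the single test
+            # image is returned as a [1, C, H, W] tensor ready for the model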
+ return img, image_path + elif self.mode == 'val': + img, _ = self.transforms(img=image_path) + label = np.asarray(Image.open(label_path).convert('P')) + label = label[np.newaxis, :, :] + return img, label + else: + img, label = self.transforms(img=image_path, label=label_path) + return img, label + + def __len__(self): + return len(self.file_list) diff --git a/semantic_segmentation/src/datasets/pascal_context.py b/semantic_segmentation/src/datasets/pascal_context.py new file mode 100644 index 00000000..f6c2e6e3 --- /dev/null +++ b/semantic_segmentation/src/datasets/pascal_context.py @@ -0,0 +1,67 @@ +import os +from PIL import Image +from src.datasets import Dataset +from src.transforms import Compose + + +class PascalContext(Dataset): + """PascalContext + + This dataset is a set of additional annotations for PASCAL VOC 2010. It goes + beyond the original PASCAL semantic segmentation task by providing annotations + for the whole scene. The statistics section has a full list of 400+ labels. + Here, we choose 59 foreground and 1 background class for training segmentation + models. (The ``img`` is fixed to '.jpg' and ``label`` is fixed to '.png'.) + + Args: + transforms (list): Transforms for image. + dataset_root (str): The dataset directory. Default: None + mode (str): Which part of dataset to use. ('train', 'trainval', + 'context', 'val'). + num_classes (int): the number of classes + """ + + def __init__(self, transforms=None, dataset_root=None, mode='train', + num_classes=60): + super(PascalContext, self).__init__(transforms=transforms, + num_classes=num_classes, dataset_root=dataset_root, mode=mode) + + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + mode = mode.lower() + self.mode = mode + self.file_list = list() + self.num_classes = num_classes + self.ignore_index = 255 + + if mode not in ['train', 'trainval', 'val']: + raise ValueError("`mode` should be one of ('train', 'trainval', 'val')" + "in PascalContext dataset, but got {}.".format(mode)) + + if self.dataset_root is None: + raise ValueError("the path of this dataset is None") + + image_set_dir = os.path.join( + self.dataset_root, 'ImageSets', 'SegmentationContext') + + if mode == 'train': + file_path = os.path.join(image_set_dir, 'train.txt') + elif mode == 'val': + file_path = os.path.join(image_set_dir, 'val.txt') + #file_path = os.path.join(image_set_dir, 'val_mini.txt') + elif mode == 'trainval': + file_path = os.path.join(image_set_dir, 'trainval.txt') + print("file_path: ", file_path) + if not os.path.exists(file_path): + raise RuntimeError("PASCAL-Context annotations are not ready.") + + img_dir = os.path.join(self.dataset_root, 'JPEGImages') + label_dir = os.path.join(self.dataset_root, 'SegmentationClassContext') + + with open(file_path, 'r') as f: + for line in f: + line = line.strip() + image_path = os.path.join(img_dir, ''.join([line, '.jpg'])) + label_path = os.path.join(label_dir, ''.join([line, '.png'])) + self.file_list.append([image_path, label_path]) + print("mode: {}, file_nums: {}".format(mode, len(self.file_list))) diff --git a/semantic_segmentation/src/datasets/trans10k_v2.py b/semantic_segmentation/src/datasets/trans10k_v2.py new file mode 100644 index 00000000..97b25514 --- /dev/null +++ b/semantic_segmentation/src/datasets/trans10k_v2.py @@ -0,0 +1,47 @@ +import os +import glob +from src.datasets import Dataset +from src.transforms import Compose + + +class Trans10kV2(Dataset): + """Trans10kV2 + + It contains the first extensive transparent object segmentation dataset, + 
which contains 11 fine-grained transparent object categories + + Args: + transforms (list): Transforms for image. + dataset_root (str): Trans10kV2 dataset directory. + mode (str, optional): Which part of dataset to use. Default: 'train'. + num_classes (int): the number of classes + """ + + def __init__(self, transforms, dataset_root, mode='train', num_classes=12): + super(Trans10kV2, self).__init__(transforms=transforms, + num_classes=num_classes, dataset_root=dataset_root, mode=mode) + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + self.file_list = list() + mode = mode.lower() + self.mode = mode + self.num_classes = num_classes + self.ignore_index = 255 + if mode == 'val': + mode = 'validation' + img_dir = os.path.join(self.dataset_root, mode, 'images') + label_dir = os.path.join(self.dataset_root, mode, 'masks_12') + if self.dataset_root is None or not os.path.isdir( + self.dataset_root) or not os.path.isdir( + img_dir) or not os.path.isdir(label_dir): + raise ValueError("The dataset is not Found or the folder structure" + "is nonconfoumance.") + + label_files = sorted(glob.glob(os.path.join(label_dir, '*_mask.png')), key=lambda x: x.split('_m')[0]) + img_files = sorted(glob.glob(os.path.join(img_dir, '*.jpg')), key=lambda x: x.split('.')[0]) + + self.file_list = [[ + img_path, label_path + ] for img_path, label_path in zip(img_files, label_files)] + + print("mode: {}, file_nums: {}".format(mode, len(self.file_list))) diff --git a/semantic_segmentation/src/datasets/vaihingen.py b/semantic_segmentation/src/datasets/vaihingen.py new file mode 100644 index 00000000..a08bce8e --- /dev/null +++ b/semantic_segmentation/src/datasets/vaihingen.py @@ -0,0 +1,70 @@ +import os +import numpy as np +from PIL import Image +from src.datasets import Dataset +from src.transforms import Compose +import src.transforms.functional as F + + +class Vaihingen(Dataset): + """Vaihingen + + ISPRS 2D Semantic Labeling Contest + https://www2.isprs.org/commissions/comm2/wg4/benchmark/2d-sem-label-vaihingen/ + + aistudio: https://aistudio.baidu.com/aistudio/datasetdetail/103733 + + Args: + transforms (list): A list of image transformations. + dataset_root (str, optional): The Vaihingen dataset directory. Default: None. + mode (str, optional): A subset of the entire dataset. It should be + one of ('train', 'val'). Default: 'train'. 
+ num_classes (int): the number of classes + """ + + def __init__(self, transforms, dataset_root=None, mode='train', num_classes=6): + super(Vaihingen, self).__init__( + transforms=transforms, num_classes=num_classes, + dataset_root=dataset_root, mode=mode) + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + mode = mode.lower() + self.mode = mode + self.file_list = list() + self.num_classes = num_classes + self.ignore_index = 255 + + if mode not in ['train', 'val']: + raise ValueError("`mode` should be one of ('train', 'val') in " + "Vaihingen dataset, but got {}.".format(mode)) + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + if mode == 'train': + img_dir = os.path.join(self.dataset_root, 'images/training') + label_dir = os.path.join(self.dataset_root, 'annotations/training') + elif mode == 'val': + img_dir = os.path.join(self.dataset_root, 'images/validation') + label_dir = os.path.join(self.dataset_root,'annotations/validation') + img_files = os.listdir(img_dir) + label_files = [i.replace('.tif', '_noBoundary.png') for i in img_files] + for i in range(len(img_files)): + img_path = os.path.join(img_dir, img_files[i]) + label_path = os.path.join(label_dir, label_files[i]) + self.file_list.append([img_path, label_path]) + + def __getitem__(self, idx): + image_path, label_path = self.file_list[idx] + if self.mode == 'val': + img, _ = self.transforms(img=image_path) + label = np.asarray(Image.open(label_path)) + # The class 0 is ignored. And it will equal to 255 after + # subtracted 1, because the dtype of label is uint8. + label = label - 1 + label = label[np.newaxis, :, :] + return img, label + else: + img, label = self.transforms(img=image_path, label=label_path) + label = label - 1 + # Recover the ignore pixels adding by transform + label[label == 254] = 255 + return img, label diff --git a/semantic_segmentation/src/models/__init__.py b/semantic_segmentation/src/models/__init__.py new file mode 100644 index 00000000..e57e18ac --- /dev/null +++ b/semantic_segmentation/src/models/__init__.py @@ -0,0 +1,22 @@ +from .setr import SETR +from .upernet import UperNet +from .dpt import DPTSeg +from .segmentor import Segmentor +from .trans2seg import Trans2Seg +from .segformer import Segformer + + +def get_model(config): + if "SETR" in config.MODEL.NAME: + model = SETR(config) + elif "UperNet" in config.MODEL.NAME: + model = UperNet(config) + elif "DPT" in config.MODEL.NAME: + model = DPTSeg(config) + elif "Segmenter" in config.MODEL.NAME: + model = Segmentor(config) + elif 'Trans2Seg' in config.MODEL.NAME: + model = Trans2Seg(config) + elif "Segformer" in config.MODEL.NAME: + model = Segformer(config) + return model diff --git a/semantic_segmentation/src/models/backbones/__init__.py b/semantic_segmentation/src/models/backbones/__init__.py new file mode 100644 index 00000000..a072010e --- /dev/null +++ b/semantic_segmentation/src/models/backbones/__init__.py @@ -0,0 +1,6 @@ +from .vit_mla import ViT_MLA +from .vit import VisualTransformer +from .swin_transformer import SwinTransformer +from .deit import Deit +from .resnet import * +from .trans2seg_transformer import * \ No newline at end of file diff --git a/semantic_segmentation/src/models/backbones/deit.py b/semantic_segmentation/src/models/backbones/deit.py new file mode 100644 index 00000000..489db3d4 --- /dev/null +++ b/semantic_segmentation/src/models/backbones/deit.py @@ -0,0 +1,288 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Implement DeiT +""" + +import copy +import paddle +import paddle.nn as nn + + +class Identity(nn.Layer): + """ Identity layer + + The output of this layer is the input without any change. + Use this layer to avoid using 'if' condition in forward methods + """ + def __init__(self): + super(Identity, self).__init__() + def forward(self, x): + return x + + +class PatchEmbedding(nn.Layer): + """Patch Embeddings + + Then a proj (conv2d) layer is applied as the patch embedding. + + Args: + image_size: int, input image size, default: 224 + patch_size: int, patch size for patch embedding (k and stride for proj conv), default: 8 + in_channels: int, input channels, default: 3 + embed_dim: int, output dimension of patch embedding, default: 384 + """ + def __init__(self, + image_size=(224, 224), + patch_size=8, + in_channels=3, + embed_dim=384): + super().__init__() + assert patch_size in [4, 8, 16] + + # define patch embeddings + self.proj = nn.Conv2D(in_channels, + embed_dim, + kernel_size=patch_size, + stride=patch_size) + # num patches + self.num_patches = (image_size[0] // patch_size) * (image_size[1] // patch_size) + + def forward(self, x): + x = self.proj(x) + x = x.flatten(2) + x = x.transpose([0, 2, 1]) + return x + + +class Mlp(nn.Layer): + """ MLP module + + Impl using nn.Linear and activation is GELU, dropout is applied. + Ops: fc -> act -> dropout -> fc -> dropout + + Attributes: + fc1: nn.Linear + fc2: nn.Linear + act: GELU + dropout1: dropout after fc1 + dropout2: dropout after fc2 + """ + + def __init__(self, in_features, hidden_features, dropout=0.): + super(Mlp, self).__init__() + w_attr_1, b_attr_1 = self._init_weights() + self.fc1 = nn.Linear(in_features, + hidden_features, + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + w_attr_2, b_attr_2 = self._init_weights() + self.fc2 = nn.Linear(hidden_features, + in_features, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + self.act = nn.GELU() + self.dropout = nn.Dropout(dropout) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierUniform()) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Normal(std=1e-6)) + return weight_attr, bias_attr + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.dropout(x) + x = self.fc2(x) + x = self.dropout(x) + return x + + +class Attention(nn.Layer): + """ Attention + + Regular Attention module same as ViT + + Args: + dim: int, all heads dimension + num_heads: int, num of heads + qkv_bias: bool, if True, qkv linear layer is using bias, default: False + qk_scale: float, if None, qk_scale is dim_head ** -0.5, default: None + attention_dropout: float, dropout rate for attention dropout, default: 0. + dropout: float, dropout rate for projection dropout, default: 0. 
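+
+    Example (illustrative sketch only; shapes follow the forward method below,
+        values are arbitrary):
+        attn = Attention(dim=384, num_heads=6, qkv_bias=True)
+        tokens = paddle.randn([2, 198, 384])  # [batch, num_tokens, dim]
+        out = attn(tokens)                    # same shape: [2, 198, 384]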
+ """ + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attention_dropout=0., + dropout=0.): + super().__init__() + self.num_heads = num_heads + self.embed_dim = dim + self.dim_head = dim // num_heads + self.scale = qk_scale or self.dim_head ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_dropout = nn.Dropout(attention_dropout) + self.softmax = nn.Softmax(axis=-1) + self.proj = nn.Linear(dim, dim) + self.proj_dropout = nn.Dropout(dropout) + + def transpose_multihead(self, x): + new_shape = x.shape[:-1] + [self.num_heads, self.dim_head] + x = x.reshape(new_shape) + x = x.transpose([0, 2, 1, 3]) + return x + + def forward(self, x): + qkv = self.qkv(x).chunk(3, axis=-1) + q, k, v = map(self.transpose_multihead, qkv) + + attn = paddle.matmul(q, k, transpose_y=True) + attn = attn * self.scale + attn = self.softmax(attn) + attn = self.attn_dropout(attn) + + z = paddle.matmul(attn, v) + z = z.transpose([0, 2, 1, 3]) + + new_shape = z.shape[:-2] + [self.embed_dim] + z = z.reshape(new_shape) + z = self.proj(z) + z = self.proj_dropout(z) + + return z + + +class EncoderLayer(nn.Layer): + """Transformer Encoder Layer + + Transformer encoder module, same as ViT + + Args: + dim: int, all heads dimension + num_heads: int, num of heads + mlp_ratio: float, ratio to multiply with dim for mlp hidden feature dim, default: 4. + qkv_bias: bool, if True, qkv linear layer is using bias, default: False + qk_scale: float, if None, qk_scale is dim_head ** -0.5, default: None + attention_dropout: float, dropout rate for attention dropout, default: 0. + dropout: float, dropout rate for projection dropout, default: 0. + """ + + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + attention_dropout=0, + droppath=0.): + super().__init__() + self.norm1 = nn.LayerNorm(dim, epsilon=1e-6) + self.attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attention_dropout=attention_dropout) + #self.drop_path = DropPath(droppath) if droppath > 0. else Identity() + self.drop_path = Identity() + self.norm2 = nn.LayerNorm(dim, epsilon=1e-6) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim * mlp_ratio)) + + def forward(self, x): + h = x + x = self.norm1(x) + x = self.attn(x) + x = self.drop_path(x) + x = h + x + + h = x + x = self.norm2(x) + x = self.mlp(x) + x = self.drop_path(x) + x = h + x + + return x + + +class Deit(nn.Layer): + """ + DeiT model for backbone + Args: + class_token: shape:[1, 1, embed_dim] + distill_token: shape:[1, 1, embed_dim] + pos_embed: shape:[ + 1, + 1, + image_size[0] // patch_size * image_size[1] // patch_size + 2] + out_indices: list of int, negative value not supported! 
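+
+    Usage sketch (illustrative; `config` must provide the DATA.CROP_SIZE and
+        MODEL.* fields read in __init__ below, and the input spatial size must
+        match DATA.CROP_SIZE because of the fixed positional embedding):
+        backbone = Deit(config)
+        feats = backbone(paddle.randn([1, 3, 224, 224]))
+        # feats: one [batch, num_patches + 2, hidden_size] tensor per index in
+        # config.MODEL.ENCODER.OUT_INDICES (class and distill tokens included)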
+ """ + def __init__(self, config): + super().__init__() + in_channels = config.MODEL.TRANS.IN_CHANNELS + patch_size = config.MODEL.TRANS.PATCH_SIZE + image_size = config.DATA.CROP_SIZE + embed_dim = config.MODEL.TRANS.HIDDEN_SIZE + self.out_indices = config.MODEL.ENCODER.OUT_INDICES + # patch embedding + self.patch_embed = PatchEmbedding(image_size=image_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim) + # class token + self.class_token = paddle.create_parameter( + shape=[1, 1, embed_dim], + dtype='float32', + default_initializer=nn.initializer.Constant(0.)) + # distillation token + self.distill_token = paddle.create_parameter( + shape=[1, 1, embed_dim], + dtype='float32', + default_initializer=nn.initializer.TruncatedNormal(std=.02)) + # positional embedding + self.pos_embed = paddle.create_parameter( + shape=[1, self.patch_embed.num_patches + 2, embed_dim], + dtype='float32', + default_initializer=nn.initializer.TruncatedNormal(std=.02)) + self.pos_dropout = nn.Dropout(config.MODEL.DROPOUT) + + self.layers = nn.LayerList([ + copy.deepcopy(EncoderLayer(dim=embed_dim, + num_heads=config.MODEL.TRANS.NUM_HEADS, + mlp_ratio=config.MODEL.TRANS.MLP_RATIO, + qkv_bias=config.MODEL.TRANS.QKV_BIAS, + attention_dropout=config.MODEL.ATTENTION_DROPOUT, + droppath=config.MODEL.DROP_PATH)) + for _ in range(config.MODEL.TRANS.NUM_LAYERS)]) + + def forward(self, x): + x = self.patch_embed(x) + class_tokens = self.class_token.expand([x.shape[0], -1, -1]) + distill_tokens = self.distill_token.expand([x.shape[0], -1, -1]) + x = paddle.concat((class_tokens, distill_tokens, x), axis=1) + + x = x + self.pos_embed + x = self.pos_dropout(x) + + feature_list = [] + for idx, layer in enumerate(self.layers): + x = layer(x) + if idx in self.out_indices: + feature_list.append(x) + return feature_list diff --git a/semantic_segmentation/src/models/backbones/mix_transformer.py b/semantic_segmentation/src/models/backbones/mix_transformer.py new file mode 100644 index 00000000..81d0a70a --- /dev/null +++ b/semantic_segmentation/src/models/backbones/mix_transformer.py @@ -0,0 +1,442 @@ +""" +Implement Mix Transformer of Segformer +Segformer: https://arxiv.org/abs/2105.15203 + +Adapted from two repositories below: + https://github.com/NVlabs/SegFormer + https://github.com/open-mmlab/mmsegmentation +""" + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from src.utils import load_pretrained_model + + +def to_2tuple(ele): + return (ele, ele) + + +def nlc_to_nchw(x, H, W): + assert len(x.shape) == 3 + B, L, C = x.shape + assert L == H * W + return x.transpose([0, 2, 1]).reshape([B, C, H, W]) + + +def nchw_to_nlc(x): + assert len(x.shape) == 4 + return x.flatten(2).transpose([0, 2, 1]) + + +class Identity(nn.Layer): + """ Identity layer + + The output of this layer is the input without any change. + Use this layer to avoid if condition in some forward methods + + """ + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +class DropPath(nn.Layer): + """DropPath class""" + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def drop_path(self, inputs): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if self.drop_prob == 0. 
or not self.training: + return inputs + keep_prob = 1 - self.drop_prob + keep_prob = paddle.to_tensor(keep_prob, dtype='float32') + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1 + ) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide( + keep_prob + ) * random_tensor # divide is to keep same output expectation + return output + + def forward(self, inputs): + return self.drop_path(inputs) + + +class PatchEmbed(nn.Layer): + """ + use a conv layer to implement PatchEmbed. + odd kernel size perform overlap patch embedding + even kernel size perform non-overlap patch embedding + Args: + in_channels (int): The num of input channels. Default: 3 + embed_dims (int): The dimensions of embedding. Default: 768 + conv_type (dict, optional): The config dict for conv layers type + selection. Default: None. + kernel_size (int): The kernel_size of embedding conv. Default: 16. + stride (int): The slide stride of embedding conv. + Default: None (Default to be equal with kernel_size). + padding (int): The padding length of embedding conv. Default: 0. + pad_to_patch_size (bool, optional): Whether to pad feature map shape + to multiple patch size. Default: True. + """ + def __init__(self, + in_channels=3, + embed_dims=768, + kernel_size=16, + stride=16, + padding=0, + pad_to_patch_size=True): + super(PatchEmbed, self).__init__() + + self.embed_dims = embed_dims + if stride is None: + stride = kernel_size + self.pad_to_patch_size = pad_to_patch_size + + # The default setting of patch size is equal to kernel size. + patch_size = kernel_size + if isinstance(patch_size, int): + patch_size = to_2tuple(patch_size) + elif isinstance(patch_size, tuple): + if len(patch_size) == 1: + patch_size = to_2tuple(patch_size[0]) + assert len(patch_size) == 2, \ + f'The size of patch should have length 1 or 2, ' \ + f'but got {len(patch_size)}' + self.patch_size = patch_size + + # Use conv layer to embed + self.projection = nn.Conv2D(in_channels=in_channels, + out_channels=embed_dims, + kernel_size=kernel_size, + stride=stride, + padding=padding) + self.norm = nn.LayerNorm(embed_dims) + + def forward(self, x): + H, W = x.shape[2], x.shape[3] + # TODO: Process overlapping op + if self.pad_to_patch_size: + # Modify H, W to multiple of patch size. + if H % self.patch_size[0] != 0: + x = F.pad( + x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) + if W % self.patch_size[1] != 0: + x = F.pad( + x, (0, self.patch_size[1] - W % self.patch_size[1], 0, 0)) + + x = self.projection(x) + self.DH, self.DW = x.shape[2], x.shape[3] + x = nchw_to_nlc(x) + x = self.norm(x) + + return x + + +class MixFFN(nn.Layer): + """An implementation of MixFFN of Segformer. + + The differences between MixFFN & FFN: + 1. Use 1X1 Conv to replace Linear layer. + 2. Introduce 3X3 Conv to encode positional information. + + Args: + embed_dims (int): The feature dimension. Same as + `MultiheadAttention`. Defaults: 256. + feedforward_channels (int): The hidden dimension of FFNs. + Defaults: 1024. + ffn_drop (float, optional): Probability of an element to be + zeroed in FFN. Default 0.0. 
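+
+    Example (illustrative sketch; the token sequence length must equal H * W,
+        matching nlc_to_nchw above):
+        ffn = MixFFN(embed_dims=64, feedforward_channels=256)
+        x = paddle.randn([2, 32 * 32, 64])  # [batch, H*W, embed_dims]
+        out = ffn(x, 32, 32)                # [2, 1024, 64]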
+ """ + def __init__(self, embed_dims, feedforward_channels, ffn_drop=0.): + super(MixFFN, self).__init__() + + self.embed_dims = embed_dims + self.feedforward_channels = feedforward_channels + in_channels = embed_dims + + self.act = nn.GELU() + self.fc1 = nn.Conv2D(in_channels=in_channels, + out_channels=feedforward_channels, + kernel_size=1, + stride=1, + bias_attr=None) + # 3x3 depth wise conv to provide positional encode information + self.pe_conv = nn.Conv2D(in_channels=feedforward_channels, + out_channels=feedforward_channels, + kernel_size=3, + stride=1, + padding=(3 - 1) // 2, + bias_attr=None, + groups=feedforward_channels) + self.fc2 = nn.Conv2D(in_channels=feedforward_channels, + out_channels=in_channels, + kernel_size=1, + stride=1, + bias_attr=None) + self.drop = nn.Dropout(ffn_drop) + + def forward(self, x, H, W): + x = nlc_to_nchw(x, H, W) + x = self.fc1(x) + x = self.pe_conv(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + x = nchw_to_nlc(x) + + return x + + +class EfficientAttention(nn.Layer): + """ An implementation of Efficient Multi-head Attention of Segformer. + + """ + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + attn_drop=0., + proj_drop=0., + sr_ratio=1): + super(EfficientAttention, self).__init__() + assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." + self.num_heads = num_heads + + w_attr_0, b_attr_0 = self._init_weights() + w_attr_1, b_attr_1 = self._init_weights() + w_attr_2, b_attr_2 = self._init_weights() + self.q = nn.Linear(dim, + dim, + weight_attr=w_attr_0, + bias_attr=b_attr_0 if qkv_bias else False) + self.kv = nn.Linear(dim, + dim * 2, + weight_attr=w_attr_1, + bias_attr=b_attr_1 if qkv_bias else False) + self.proj = nn.Linear(dim, + dim, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + + self.scales = (dim // num_heads)**-0.5 # 0.125 for Large + self.attn_drop = nn.Dropout(attn_drop) + self.proj_drop = nn.Dropout(proj_drop) + self.softmax = nn.Softmax(axis=-1) + + self.sr_ratio = sr_ratio + if sr_ratio > 1: + self.sr = nn.Conv2D(dim, + dim, + kernel_size=sr_ratio, + stride=sr_ratio) + self.norm = nn.LayerNorm(dim) + + def _init_weights(self): + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform()) + bias_attr = paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform()) + return weight_attr, bias_attr + + def forward(self, x, H, W): + B, N, C = x.shape + q = self.q(x) + + if self.sr_ratio > 1: + x_ = nlc_to_nchw(x, H, W) + x_ = self.sr(x_) + x_ = nchw_to_nlc(x_) + x_ = self.norm(x_) + kv = self.kv(x_).reshape([B,-1,2,C]).transpose([2, 0, 1,3]) + else: + kv = self.kv(x).reshape([B,-1,2,C]).transpose([2, 0, 1,3]) + k, v = kv[0], kv[1] + + q, k, v = [x.transpose([1,0,2]) for x in (q, k, v)] + q, k, v = [x.reshape([-1,B*self.num_heads,C//self.num_heads]) for x in (q, k, v)] + q, k, v = [x.transpose([1,0,2]) for x in (q, k, v)] + attn = paddle.matmul(q, k, transpose_y=True)* self.scales + attn = self.softmax(attn) + attn = self.attn_drop(attn) + + x = paddle.matmul(attn, v).transpose([1,0,2]).reshape([N, B, C]) + x = self.proj(x).transpose([1,0,2]) + x = self.proj_drop(x) + + return x + + +class TransformerEncoderLayer(nn.Layer): + """Implements one encoder layer in Segformer. + + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + drop_rate (float): Probability of an element to be zeroed. + after the feed forward layer. Default 0.0. 
+ attn_drop_rate (float): The drop out rate for attention layer. + Default 0.0. + drop_path_rate (float): stochastic depth rate. Default 0.0. + qkv_bias (bool): enable bias for qkv if True. + Default: True. + sr_ratio (int): The ratio of spatial reduction of Efficient Multi-head + Attention of Segformer. Default: 1. + """ + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + qkv_bias=True, + sr_ratio=1): + super(TransformerEncoderLayer, self).__init__() + + self.norm1 = nn.LayerNorm(embed_dims) + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate > 0. else Identity() + + self.attn = EfficientAttention(dim=embed_dims, + num_heads=num_heads, + attn_drop=attn_drop_rate, + proj_drop=drop_rate, + qkv_bias=qkv_bias, + sr_ratio=sr_ratio) + self.norm2 = nn.LayerNorm(embed_dims) + self.ffn = MixFFN(embed_dims=embed_dims, + feedforward_channels=feedforward_channels, + ffn_drop=drop_rate) + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.ffn(self.norm2(x), H, W)) + return x + + +class MixVisionTransformer(nn.Layer): + """The backbone of Segformer. + + A Paddle implement of : `SegFormer: Simple and Efficient Design for + Semantic Segmentation with Transformers` - + https://arxiv.org/pdf/2105.15203.pdf + + Args: + in_channels (int): Number of input channels. Default: 3. + embed_dims (int): Embedding dimension. Default: 768. + num_stags (int): The num of stages. Default: 4. + num_layers (Sequence[int]): The layer number of each transformer encode + layer. Default: [3, 4, 6, 3]. + num_heads (Sequence[int]): The attention heads of each transformer + encode layer. Default: [1, 2, 4, 8]. + patch_sizes (Sequence[int]): The patch_size of each overlapped patch + embedding. Default: [7, 3, 3, 3]. + strides (Sequence[int]): The stride of each overlapped patch embedding. + Default: [4, 2, 2, 2]. + sr_ratios (Sequence[int]): The spatial reduction rate of each + transformer encode layer. Default: [8, 4, 2, 1]. + out_indices (Sequence[int] | int): Output from which stages. + Default: (0, 1, 2, 3). + mlp_ratio (int): ratio of mlp hidden dim to embedding dim. + Default: 4. + qkv_bias (bool): Enable bias for qkv if True. Default: True. + drop_rate (float): Probability of an element to be zeroed. + Default 0.0 + attn_drop_rate (float): The drop out rate for attention layer. + Default 0.0 + drop_path_rate (float): stochastic depth rate. Default 0.0 + pretrained (str, optional): model pretrained path. Default: None. 
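+
+    Example (illustrative sketch using the default arguments above):
+        backbone = MixVisionTransformer()
+        feats = backbone(paddle.randn([1, 3, 224, 224]))
+        # four pyramid features at strides 4/8/16/32, e.g.
+        # [1, 64, 56, 56], [1, 128, 28, 28], [1, 256, 14, 14], [1, 512, 7, 7]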
+ """ + def __init__(self, + in_channels=3, + embed_dims=64, + num_stages=4, + num_layers=[3, 4, 6, 3], + num_heads=[1, 2, 4, 8], + patch_sizes=[7, 3, 3, 3], + strides=[4, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + out_indices=(0, 1, 2, 3), + mlp_ratio=4, + qkv_bias=True, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + pretrained=None): + super(MixVisionTransformer, self).__init__() + + self.embed_dims = embed_dims + self.num_stages = num_stages + self.num_layers = num_layers + self.num_heads = num_heads + self.patch_sizes = patch_sizes + self.strides = strides + self.sr_ratios = sr_ratios + assert num_stages == len(num_layers) == len(num_heads) \ + == len(patch_sizes) == len(strides) == len(sr_ratios) + + self.out_indices = out_indices + assert max(out_indices) < self.num_stages + self.pretrained = pretrained + + dpr = [x for x in paddle.linspace(0, drop_path_rate, sum(num_layers))] + + cur = 0 + self.layers = nn.LayerList() + for i, num_layer in enumerate(num_layers): + embed_dims_i = embed_dims * num_heads[i] + patch_embed = PatchEmbed(in_channels=in_channels, + embed_dims=embed_dims_i, + kernel_size=patch_sizes[i], + stride=strides[i], + padding=patch_sizes[i] // 2, + pad_to_patch_size=False) + layer = nn.LayerList([ + TransformerEncoderLayer( + embed_dims=embed_dims_i, + num_heads=num_heads[i], + feedforward_channels=mlp_ratio * embed_dims_i, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[cur + idx], + qkv_bias=qkv_bias, + sr_ratio=sr_ratios[i]) for idx in range(num_layer) + ]) + in_channels = embed_dims_i + norm = nn.LayerNorm(embed_dims_i) + self.layers.append(nn.LayerList([patch_embed, layer, norm])) + cur += num_layer + + if isinstance(self.pretrained, str): + load_pretrained_model(self, self.pretrained) + + def forward(self, x): + outs = [] + + for i, layer in enumerate(self.layers): + x, H, W = layer[0](x), layer[0].DH, layer[0].DW + for block in layer[1]: + x = block(x, H, W) + x = layer[2](x) + x = nlc_to_nchw(x, H, W) + if i in self.out_indices: + outs.append(x) + + return outs diff --git a/semantic_segmentation/src/models/backbones/resnet.py b/semantic_segmentation/src/models/backbones/resnet.py new file mode 100644 index 00000000..67c3ea5f --- /dev/null +++ b/semantic_segmentation/src/models/backbones/resnet.py @@ -0,0 +1,251 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
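+
+# Usage sketch (illustrative; `config` must provide the MODEL.OUTPUT_STRIDE,
+# MODEL.BACKBONE_SCALE and MODEL.ENCODER.MULTI_GRID / MULTI_DILATION fields
+# read by ResNetV1 below):
+#
+#     backbone = resnet50c(config, norm_layer=nn.BatchNorm2D)
+#     c1, c2, c3, c4 = backbone(paddle.randn([1, 3, 512, 512]))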
+ +""" +Implement resnet50c backbone +""" + +import os +import logging +import paddle +import paddle.nn as nn + + +class BasicBlockV1b(nn.Layer): + """BasicBlockV1b Implement + """ + expansion = 1 + def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, + previous_dilation=1, norm_layer=nn.BatchNorm2D): + super(BasicBlockV1b, self).__init__() + self.conv1 = nn.Conv2D(inplanes, planes, 3, stride, + dilation, dilation, bias_attr=False) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(True) + self.conv2 = nn.Conv2D(planes, planes, 3, 1, previous_dilation, + dilation=previous_dilation, bias_attr=False) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class BottleneckV1b(nn.Layer): + """BottleneckV1b Implement + """ + expansion = 4 + def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, + previous_dilation=1, norm_layer=nn.BatchNorm2D): + super(BottleneckV1b, self).__init__() + self.conv1 = nn.Conv2D(inplanes, planes, 1, bias_attr=False) + self.bn1 = norm_layer(planes) + self.conv2 = nn.Conv2D(planes, planes, 3, stride, + dilation, dilation, bias_attr=False) + self.bn2 = norm_layer(planes) + self.conv3 = nn.Conv2D(planes, planes * self.expansion, 1, bias_attr=False) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class ResNetV1(nn.Layer): + """ResNetV1 + """ + def __init__(self, block, layers, config, num_classes=1000, deep_stem=False, + zero_init_residual=False, norm_layer=nn.BatchNorm2D): + output_stride = config.MODEL.OUTPUT_STRIDE + scale = config.MODEL.BACKBONE_SCALE + if output_stride == 32: + dilations = [1, 1] + strides = [2, 2] + elif output_stride == 16: + dilations = [1, 2] + strides = [2, 1] + elif output_stride == 8: + dilations = [2, 4] + strides = [1, 1] + else: + raise NotImplementedError + self.inplanes = int((128 if deep_stem else 64) * scale) + super(ResNetV1, self).__init__() + if deep_stem: + # resnet vc + mid_channel = int(64 * scale) + self.conv1 = nn.Sequential( + nn.Conv2D(3, mid_channel, 3, 2, 1, bias_attr=False), + norm_layer(mid_channel), + nn.ReLU(True), + nn.Conv2D(mid_channel, mid_channel, 3, 1, 1, bias_attr=False), + norm_layer(mid_channel), + nn.ReLU(True), + nn.Conv2D(mid_channel, self.inplanes, 3, 1, 1, bias_attr=False) + ) + else: + self.conv1 = nn.Conv2D(3, self.inplanes, 7, 2, 3, bias_attr=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(True) + self.maxpool = nn.MaxPool2D(3, 2, 1) + self.layer1 = self._make_layer(block, int(64 * scale), layers[0], norm_layer=norm_layer) + self.layer2 = self._make_layer(block, int(128 * scale), layers[1], stride=2, norm_layer=norm_layer) + + self.layer3 = self._make_layer(block, int(256 * scale), layers[2], stride=strides[0], dilation=dilations[0], + norm_layer=norm_layer) + self.layer4 = 
self._make_layer(block, int(512 * scale), layers[3], stride=strides[1], dilation=dilations[1], + norm_layer=norm_layer, multi_grid=config.MODEL.ENCODER.MULTI_GRID, + multi_dilation=config.MODEL.ENCODER.MULTI_DILATION) + + self.last_inp_channels = int(512 * block.expansion * scale) + self.avgpool = nn.AdaptiveAvgPool2D((1, 1)) + self.fc = nn.Linear(int(512 * block.expansion * scale), num_classes) + + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + m.weight = paddle.create_parameter(shape=m.weight.shape, + dtype='float32', default_initializer=nn.initializer.KaimingNormal()) + elif isinstance(m, nn.BatchNorm2D): + m.weight = paddle.create_parameter(shape=m.weight.shape, dtype='float32', + default_initializer=nn.initializer.Constant(value=1.0)) + m.bias = paddle.create_parameter(shape=m.bias.shape, dtype='float32', + default_initializer=nn.initializer.Constant(value=0.0)) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. + # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + if zero_init_residual: + for m in self.sublayers(): + if isinstance(m, BottleneckV1b): + m.bn3.weight = paddle.create_parameter(shape=m.bn3.weight.shape, + dtype='float32', default_initializer=nn.initializer.Constant(0.0)) + elif isinstance(m, BasicBlockV1b): + m.bn2.weight = paddle.create_parameter(shape=m.bn2.weight.shape, + dtype='float32', default_initializer=nn.initializer.Constant(0.0)) + + def _make_layer(self, block, planes, blocks, stride=1, dilation=1, norm_layer=nn.BatchNorm2D, + multi_grid=False, multi_dilation=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D(self.inplanes, planes * block.expansion, 1, stride, bias_attr=False), + norm_layer(planes * block.expansion), + ) + + layers = [] + if not multi_grid: + if dilation in (1, 2): + layers.append(block(self.inplanes, planes, stride, dilation=1, downsample=downsample, + previous_dilation=dilation, norm_layer=norm_layer)) + elif dilation == 4: + layers.append(block(self.inplanes, planes, stride, dilation=2, downsample=downsample, + previous_dilation=dilation, norm_layer=norm_layer)) + else: + raise RuntimeError("=> unknown dilation size: {}".format(dilation)) + else: + layers.append(block(self.inplanes, planes, stride, dilation=multi_dilation[0], + downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer)) + self.inplanes = planes * block.expansion + + if multi_grid: + div = len(multi_dilation) + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, dilation=multi_dilation[i % div], + previous_dilation=dilation, norm_layer=norm_layer)) + else: + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, dilation=dilation, + previous_dilation=dilation, norm_layer=norm_layer)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + c1 = self.layer1(x) + c2 = self.layer2(c1) + c3 = self.layer3(c2) + c4 = self.layer4(c3) + + return c1, c2, c3, c4 + + +def resnet50c(config, norm_layer=nn.BatchNorm2D): + """resnet50c implement + + The ResNet-50 [Heet al., 2016] with dilation convolution at last stage, + ResNet-50 model Ref, https://arxiv.org/pdf/1512.03385.pdf + + Args: + config (dict): configuration of network + norm_layer: normalization layer type, default, nn.BatchNorm2D + """ + num_block = [3, 4, 6, 
3] + return ResNetV1(BottleneckV1b, num_block, config, norm_layer=norm_layer, deep_stem=True) + + +def load_backbone_pretrained(model, backbone, config): + if config.MODEL.PRETRAINED: + if os.path.isfile(config.MODEL.PRETRAINED): + logging.info('Load pretrained backbone from local path!') + model.set_state_dict(paddle.load(config.MODEL.PRETRAINED)) + + +def get_segmentation_backbone(backbone, config, norm_layer=paddle.nn.BatchNorm2D): + """ + Built the backbone model, defined by `config.MODEL.BACKBONE`. + """ + model = resnet50c(config, norm_layer) + load_backbone_pretrained(model, backbone, config) + return model diff --git a/semantic_segmentation/src/models/backbones/swin_transformer.py b/semantic_segmentation/src/models/backbones/swin_transformer.py new file mode 100644 index 00000000..48cccb30 --- /dev/null +++ b/semantic_segmentation/src/models/backbones/swin_transformer.py @@ -0,0 +1,596 @@ +""" +Implement Transformer Class for Swin Transformer +""" + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + + +class Identity(nn.Layer): + """ Identity layer + + The output of this layer is the input without any change. + Use this layer to avoid if condition in some forward methods + + """ + def __init__(self): + super(Identity, self).__init__() + def forward(self, x): + return x + + +class DropPath(nn.Layer): + """DropPath class""" + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def drop_path(self, inputs): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if self.drop_prob == 0. or not self.training: + return inputs + keep_prob = 1 - self.drop_prob + keep_prob = paddle.to_tensor(keep_prob, dtype='float32') + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide(keep_prob) * random_tensor # divide is to keep same output expectation + return output + + def forward(self, inputs): + return self.drop_path(inputs) + +class PatchEmbedding(nn.Layer): + """Patch Embeddings + + Apply patch embeddings on input images. Embeddings is implemented using a Conv2D op. 
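+
+    Example (illustrative sketch; note that image_size is indexed as a tuple here):
+        patch_embed = PatchEmbedding(image_size=(224, 224), patch_size=4, embed_dim=96)
+        x = paddle.randn([1, 3, 224, 224])
+        out = patch_embed(x)  # [1, 56 * 56, 96]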
+ + Attributes: + image_size: int, input image size, default: 224 + patch_size: int, size of patch, default: 4 + in_channels: int, input image channels, default: 3 + embed_dim: int, embedding dimension, default: 96 + """ + + def __init__(self, image_size=224, patch_size=4, in_channels=3, embed_dim=96): + super().__init__() + #image_size = (image_size, image_size) # TODO: add to_2tuple + patch_size = (patch_size, patch_size) + patches_resolution = [image_size[0]//patch_size[0], image_size[1]//patch_size[1]] + self.image_size = image_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + self.in_channels = in_channels + self.embed_dim = embed_dim + self.patch_embed = nn.Conv2D(in_channels=in_channels, + out_channels=embed_dim, + kernel_size=patch_size, + stride=patch_size) + self.norm = nn.LayerNorm(embed_dim) + + def forward(self, x): + _, _, H, W = x.shape + # TODO (wutianyiRosun@gmail.com): padding for processing different resolutions of input images + #if W % self.patch_size[1] !=0: + # x = F.pad(x (0, self.patch_size[1] - W % self.patch_size[1])) + #if H % self.patch_size[1] !=0: + # x = F.pad(x (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) + + x = self.patch_embed(x) # [batch, embed_dim, h, w] h,w = patch_resolution + x = x.flatten(start_axis=2, stop_axis=-1) # [batch, embed_dim, h*w] h*w = num_patches + x = x.transpose([0, 2, 1]) # [batch, h*w, embed_dim] + x = self.norm(x) # [batch, num_patches, embed_dim] + return x + + +class PatchMerging(nn.Layer): + """ Patch Merging class + + Merge multiple patch into one path and keep the out dim. + Spefically, merge adjacent 2x2 patches(dim=C) into 1 patch. + The concat dim 4*C is rescaled to 2*C + + Attributes: + input_resolution: tuple of ints, the size of input + dim: dimension of single patch + reduction: nn.Linear which maps 4C to 2C dim + norm: nn.LayerNorm, applied after linear layer. + """ + + def __init__(self, input_resolution, dim): + super(PatchMerging, self).__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4*dim, 2*dim, bias_attr=False) + self.norm = nn.LayerNorm(4*dim) + + def forward(self, x): + h, w = self.input_resolution + b, _, c = x.shape + x = x.reshape([b, h, w, c]) + + x0 = x[:, 0::2, 0::2, :] # [B, H/2, W/2, C] + x1 = x[:, 1::2, 0::2, :] # [B, H/2, W/2, C] + x2 = x[:, 0::2, 1::2, :] # [B, H/2, W/2, C] + x3 = x[:, 1::2, 1::2, :] # [B, H/2, W/2, C] + x = paddle.concat([x0, x1, x2, x3], -1) #[B, H/2, W/2, 4*C] + x = x.reshape([b, -1, 4*c]) # [B, H/2*W/2, 4*C] + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class Mlp(nn.Layer): + """ MLP module + + Impl using nn.Linear and activation is GELU, dropout is applied. 
+ Ops: fc -> act -> dropout -> fc -> dropout + + Attributes: + fc1: nn.Linear + fc2: nn.Linear + act: GELU + dropout1: dropout after fc1 + dropout2: dropout after fc2 + """ + + def __init__(self, in_features, hidden_features, dropout): + super(Mlp, self).__init__() + w_attr_1, b_attr_1 = self._init_weights() + self.fc1 = nn.Linear(in_features, + hidden_features, + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + w_attr_2, b_attr_2 = self._init_weights() + self.fc2 = nn.Linear(hidden_features, + in_features, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + self.act = nn.GELU() + self.dropout = nn.Dropout(dropout) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierUniform()) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Normal(std=1e-6)) + return weight_attr, bias_attr + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.dropout(x) + x = self.fc2(x) + x = self.dropout(x) + return x + + +class WindowAttention(nn.Layer): + """Window based multihead attention, with relative position bias. + + Both shifted window and non-shifted window are supported. + + Attributes: + dim: int, input dimension (channels) + window_size: int, height and width of the window + num_heads: int, number of attention heads + qkv_bias: bool, if True, enable learnable bias to q,k,v, default: True + qk_scale: float, override default qk scale head_dim**-0.5 if set, default: None + attention_dropout: float, dropout of attention + dropout: float, dropout for output + """ + + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attention_dropout=0., + dropout=0.): + super(WindowAttention, self).__init__() + self.window_size = window_size + self.num_heads = num_heads + self.dim = dim + self.dim_head = dim // num_heads + self.scale = qk_scale or self.dim_head ** -0.5 + + self.relative_position_bias_table = paddle.create_parameter( + shape=[(2 * window_size[0] -1) * (2 * window_size[1] - 1), num_heads], + dtype='float32', + default_initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + + # relative position index for each token inside window + coords_h = paddle.arange(self.window_size[0]) + coords_w = paddle.arange(self.window_size[1]) + coords = paddle.stack(paddle.meshgrid([coords_h, coords_w])) # [2, window_h, window_w] + coords_flatten = paddle.flatten(coords, 1) # [2, window_h * window_w] + # 2, window_h * window_w, window_h * window_h + relative_coords = coords_flatten.unsqueeze(2) - coords_flatten.unsqueeze(1) + # winwod_h*window_w, window_h*window_w, 2 + relative_coords = relative_coords.transpose([1, 2, 0]) + relative_coords[:, :, 0] += self.window_size[0] - 1 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2* self.window_size[1] - 1 + # [window_size * window_size, window_size*window_size] + relative_position_index = relative_coords.sum(-1) + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_dropout = nn.Dropout(attention_dropout) + self.proj = nn.Linear(dim, dim) + self.proj_dropout = nn.Dropout(dropout) + self.softmax = nn.Softmax(axis=-1) + + def transpose_multihead(self, x): + new_shape = x.shape[:-1] + [self.num_heads, self.dim_head] + x = x.reshape(new_shape) + x = x.transpose([0, 2, 1, 3]) + return x + + def get_relative_pos_bias_from_pos_index(self): + # relative_position_bias_table is a ParamBase object + # 
https://github.com/PaddlePaddle/Paddle/blob/067f558c59b34dd6d8626aad73e9943cf7f5960f/python/paddle/fluid/framework.py#L5727 + table = self.relative_position_bias_table # N x num_heads + # index is a tensor + index = self.relative_position_index.reshape([-1]) # window_h*window_w * window_h*window_w + # NOTE: paddle does NOT support indexing Tensor by a Tensor + relative_position_bias = paddle.index_select(x=table, index=index) + return relative_position_bias + + def forward(self, x, mask=None): + qkv = self.qkv(x).chunk(3, axis=-1) + q, k, v = map(self.transpose_multihead, qkv) + q = q * self.scale + attn = paddle.matmul(q, k, transpose_y=True) + relative_position_bias = self.get_relative_pos_bias_from_pos_index() + relative_position_bias = relative_position_bias.reshape( + [self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1]) + # nH, window_h*window_w, window_h*window_w + relative_position_bias = relative_position_bias.transpose([2, 0, 1]) + attn = attn + relative_position_bias.unsqueeze(0) + if mask is not None: + nW = mask.shape[0] + attn = attn.reshape( + [x.shape[0] // nW, nW, self.num_heads, x.shape[1], x.shape[1]]) + attn += mask.unsqueeze(1).unsqueeze(0) + attn = attn.reshape([-1, self.num_heads, x.shape[1], x.shape[1]]) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_dropout(attn) + + z = paddle.matmul(attn, v) + z = z.transpose([0, 2, 1, 3]) + new_shape = z.shape[:-2] + [self.dim] + z = z.reshape(new_shape) + z = self.proj(z) + z = self.proj_dropout(z) + + return z + + +def windows_partition(x, window_size): + """ partite windows into window_size x window_size + Args: + x: Tensor, shape=[b, h, w, c] + window_size: int, window size + Returns: + x: Tensor, shape=[num_windows*b, window_size, window_size, c] + """ + + B, H, W, C = x.shape + x = x.reshape([B, H//window_size, window_size, W//window_size, window_size, C]) + x = x.transpose([0, 1, 3, 2, 4, 5]) + x = x.reshape([-1, window_size, window_size, C]) #(num_windows*B, window_size, window_size, C) + return x + + +def windows_reverse(windows, window_size, H, W): + """ Window reverse + Args: + windows: (n_windows * B, window_size, window_size, C) + window_size: (int) window size + H: (int) height of image + W: (int) width of image + + Returns: + x: (B, H, W, C) + """ + + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.reshape([B, H // window_size, W // window_size, window_size, window_size, -1]) + x = x.transpose([0, 1, 3, 2, 4, 5]) + x = x.reshape([B, H, W, -1]) + return x + + +class SwinTransformerBlock(nn.Layer): + """Swin transformer block + + Contains window multi head self attention, droppath, mlp, norm and residual. + + Attributes: + dim: int, input dimension (channels) + input_resolution: int, input resoultion + num_heads: int, number of attention heads + windos_size: int, window size, default: 7 + shift_size: int, shift size for SW-MSA, default: 0 + mlp_ratio: float, ratio of mlp hidden dim and input embedding dim, default: 4. + qkv_bias: bool, if True, enable learnable bias to q,k,v, default: True + qk_scale: float, override default qk scale head_dim**-0.5 if set, default: None + dropout: float, dropout for output, default: 0. + attention_dropout: float, dropout of attention, default: 0. + droppath: float, drop path rate, default: 0. 
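+
+    Example (illustrative sketch; the token count must equal
+        input_resolution[0] * input_resolution[1]):
+        block = SwinTransformerBlock(dim=96, input_resolution=(56, 56),
+                                     num_heads=3, window_size=7, shift_size=3)
+        x = paddle.randn([2, 56 * 56, 96])
+        out = block(x)  # [2, 3136, 96]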
+ """ + + def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0, + mlp_ratio=4., qkv_bias=True, qk_scale=None, dropout=0., + attention_dropout=0., droppath=0.): + super(SwinTransformerBlock, self).__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + if min(self.input_resolution) <= self.window_size: + self.shift_size = 0 + self.window_size = min(self.input_resolution) + + self.norm1 = nn.LayerNorm(dim) + self.attn = WindowAttention(dim, + window_size=(self.window_size, self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attention_dropout=attention_dropout, + dropout=dropout) + self.drop_path = DropPath(droppath) if droppath > 0. else None + self.norm2 = nn.LayerNorm(dim) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim*mlp_ratio), + dropout=dropout) + + if self.shift_size > 0: + H, W = self.input_resolution + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = paddle.zeros((1, Hp, Wp, 1)) + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = windows_partition(img_mask, self.window_size) + mask_windows = mask_windows.reshape((-1, self.window_size * self.window_size)) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = paddle.where(attn_mask != 0, + paddle.ones_like(attn_mask) * float(-100.0), + attn_mask) + attn_mask = paddle.where(attn_mask == 0, + paddle.zeros_like(attn_mask), + attn_mask) + else: + attn_mask = None + + self.register_buffer("attn_mask", attn_mask) + + def forward(self, x): + H, W = self.input_resolution + B, L, C = x.shape + h = x + x = self.norm1(x) + + new_shape = [B, H, W, C] + x = x.reshape(new_shape) + + # pad feature maps to multiples of window size + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = x.transpose([0, 3, 1, 2]) # (B,C,H,W) + x = F.pad(x, [pad_l, pad_r, pad_t, pad_b]) + x = x.transpose([0, 2, 3, 1]) # (B,H,W,C) + _, Hp, Wp, _ = x.shape + # cyclic shift + if self.shift_size > 0: + shifted_x = paddle.roll(x, + shifts=(-self.shift_size, -self.shift_size), + axis=(1, 2)) + else: + shifted_x = x + x_windows = windows_partition(shifted_x, self.window_size) + x_windows = x_windows.reshape([-1, self.window_size * self.window_size, C]) + attn_windows = self.attn(x_windows, mask=self.attn_mask) + attn_windows = attn_windows.reshape([-1, self.window_size, self.window_size, C]) + shifted_x = windows_reverse(attn_windows, self.window_size, Hp, Wp) + # reverse cyclic shift + if self.shift_size > 0: + x = paddle.roll(shifted_x, + shifts=(self.shift_size, self.shift_size), + axis=(1, 2)) + else: + x = shifted_x + # remove padding + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :] + x = x.reshape([B, H*W, C]) + if self.drop_path is not None: + x = h + self.drop_path(x) + else: + x = h + x + h = x + x = self.norm2(x) + x = self.mlp(x) + if self.drop_path is not None: + x = h + self.drop_path(x) + else: + x = h + x + return x + + +class 
SwinTransformerStage(nn.Layer): + """Stage layers for swin transformer + + Stage layers contains a number of Transformer blocks and an optional + patch merging layer, patch merging is not applied after last stage + + Attributes: + dim: int, embedding dimension + input_resolution: tuple, input resoliution + depth: list, num of blocks in each stage + blocks: nn.LayerList, contains SwinTransformerBlocks for one stage + downsample: PatchMerging, patch merging layer, none if last stage + """ + def __init__(self, dim, input_resolution, depth, num_heads, window_size, + mlp_ratio=4., qkv_bias=True, qk_scale=None, dropout=0., + attention_dropout=0., droppath=0., downsample=None): + super(SwinTransformerStage, self).__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + + self.blocks = nn.LayerList() + for i in range(depth): + self.blocks.append( + SwinTransformerBlock( + dim=dim, input_resolution=input_resolution, + num_heads=num_heads, window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + dropout=dropout, attention_dropout=attention_dropout, + droppath=droppath[i] if isinstance(droppath, list) else droppath)) + + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim) + else: + self.downsample = None + + def forward(self, x): + for block in self.blocks: + x = block(x) + if self.downsample is not None: + x_down = self.downsample(x) + return [x, x_down] + return [x, x] + + +class SwinTransformer(nn.Layer): + """SwinTransformer class + + Attributes: + num_classes: int, num of image classes + num_stages: int, num of stages contains patch merging and Swin blocks + depths: list of int, num of Swin blocks in each stage + num_heads: int, num of heads in attention module + embed_dim: int, output dimension of patch embedding + num_features: int, output dimension of whole network before classifier + mlp_ratio: float, hidden dimension of mlp layer is mlp_ratio * mlp input dim + qkv_bias: bool, if True, set qkv layers have bias enabled + qk_scale: float, scale factor for qk. + ape: bool, if True, set to use absolute positional embeddings + window_size: int, size of patch window for inputs + dropout: float, dropout rate for linear layer + dropout_attn: float, dropout rate for attention + patch_embedding: PatchEmbedding, patch embedding instance + patch_resolution: tuple, number of patches in row and column + position_dropout: nn.Dropout, dropout op for position embedding + stages: SwinTransformerStage, stage instances. + norm: nn.LayerNorm, norm layer applied after transformer + avgpool: nn.AveragePool2D, pooling layer before classifer + fc: nn.Linear, classifier op. 
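+
+    Example (an illustrative sketch; assumes `config` is the project's yacs-style
+        CfgNode carrying the DATA.* and MODEL.* fields read in __init__):
+        >>> model = SwinTransformer(config)
+        >>> feats = model(paddle.randn([1, 3, 512, 512]))
+        >>> # feats is a list of flattened [B, num_tokens, C] stage outputs, one
+        >>> # for each stage index in config.MODEL.ENCODER.OUT_INDICES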
+ """ + def __init__(self, config): + super(SwinTransformer, self).__init__() + + self.num_classes = config.DATA.NUM_CLASSES + self.num_stages = len(config.MODEL.TRANS.STAGE_DEPTHS) + self.depths = config.MODEL.TRANS.STAGE_DEPTHS + self.num_heads = config.MODEL.TRANS.NUM_HEADS + self.embed_dim = config.MODEL.TRANS.EMBED_DIM + self.num_features = int(self.embed_dim * 2 ** (self.num_stages - 1)) + self.mlp_ratio = config.MODEL.TRANS.MLP_RATIO + self.qkv_bias = config.MODEL.TRANS.QKV_BIAS + self.qk_scale = config.MODEL.TRANS.QK_SCALE + self.ape = config.MODEL.TRANS.APE + self.window_size = config.MODEL.TRANS.WINDOW_SIZE + self.dropout = config.MODEL.DROPOUT + self.attention_dropout = config.MODEL.ATTENTION_DROPOUT + self.out_indices = config.MODEL.ENCODER.OUT_INDICES + + self.patch_embedding = PatchEmbedding(image_size=config.DATA.CROP_SIZE, + patch_size=config.MODEL.TRANS.PATCH_SIZE, + in_channels=config.MODEL.TRANS.IN_CHANNELS, + embed_dim=config.MODEL.TRANS.EMBED_DIM) + num_patches = self.patch_embedding.num_patches + self.patches_resolution = self.patch_embedding.patches_resolution + + + if self.ape: + self.absolute_positional_embedding = paddle.nn.ParameterList([ + paddle.create_parameter( + shape=[1, num_patches, self.embed_dim], dtype='float32', + default_initializer=paddle.nn.initializer.TruncatedNormal(std=.02))]) + + self.position_dropout = nn.Dropout(config.MODEL.DROPOUT) + + depth_decay = [x for x in paddle.linspace(0, config.MODEL.DROP_PATH, sum(self.depths))] + + self.stages = nn.LayerList() + for stage_idx in range(self.num_stages): + stage = SwinTransformerStage( + dim=int(self.embed_dim * 2 ** stage_idx), + input_resolution=( + self.patches_resolution[0] // (2 ** stage_idx), + self.patches_resolution[1] // (2 ** stage_idx)), + depth=self.depths[stage_idx], + num_heads=self.num_heads[stage_idx], + window_size=self.window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=self.qkv_bias, + qk_scale=self.qk_scale, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + droppath=depth_decay[ + sum(self.depths[:stage_idx]):sum(self.depths[:stage_idx+1])], + downsample=PatchMerging if ( + stage_idx < self.num_stages-1) else None, + ) + self.stages.append(stage) + + def forward(self, x): + x = self.patch_embedding(x) # (B, HW/16, dim) + if self.ape: + x = x + self.absolute_positional_embedding + x = self.position_dropout(x) + outs = [] + for idx in range(len(self.stages)): + x_out, x = self.stages[idx](x) + if idx in self.out_indices: + outs.append(x_out) + return outs + diff --git a/semantic_segmentation/src/models/backbones/trans2seg_transformer.py b/semantic_segmentation/src/models/backbones/trans2seg_transformer.py new file mode 100644 index 00000000..d2ded967 --- /dev/null +++ b/semantic_segmentation/src/models/backbones/trans2seg_transformer.py @@ -0,0 +1,404 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Implement Transformer Class for Trans2seg +""" + +import math +import paddle +import warnings +import paddle.nn as nn +import paddle.nn.functional as F +from .swin_transformer import Identity, DropPath, Mlp + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2) + + with paddle.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor=paddle.uniform(shape=tensor.shape, min=2 * l - 1, max=2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor=tensor.erf() + + # Transform to proper mean, std + tensor=tensor.multiply(paddle.to_tensor(std * math.sqrt(2.))) + tensor=tensor.add(paddle.to_tensor(mean)) + + # Clamp to ensure it's in the proper range + tensor=tensor.clip(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): + # type: (Tensor, float, float, float, float) -> Tensor + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. 
+ Attributes: + tensor: an n-dimensional `paddle.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = paddle.empty([3, 5]) + >>> trunc_normal_(w) + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +def expand(x, nclass): + return x.unsqueeze(1).tile([1, nclass, 1, 1, 1]).flatten(0, 1) + + +class Attention_Encoder(nn.Layer): + """Attention Encoder Implement + + multi-head self-attention module + + Attributes: + dim: int, input dimension (channels) + num_heads: int, number of attention heads + qkv_bias: bool, if True, enable learnable bias to q,k,v, default: False + qk_scale: float, override default qk scale head_dim**-0.5 if set, default: None + attn_drop: float, dropout of attention + proj_drop: float, dropout for output + """ + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + + + def forward(self, x): + B, N, C = x.shape + #qkv shape [3, N, num_head, HW, C//num_head] + qkv = self.qkv(x).reshape([B, N, 3, self.num_heads, C // self.num_heads]).transpose([2, 0, 3, 1, 4]) + q, k, v = qkv[0], qkv[1], qkv[2] # [N, num_head, HW, C//num_head] + attn = (q @ k.transpose([0, 1, 3, 2])) * self.scale + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Attention_Decoder(nn.Layer): + """Attention Decoder Implement + + Attributes: + dim: int, input dimension (channels) + num_heads: int, number of attention heads + qkv_bias: bool, if True, enable learnable bias to q,k,v, default: False + qk_scale: float, override default qk scale head_dim**-0.5 if set, default: None + attn_drop: float, dropout of attention + proj_drop: float, dropout for output + """ + def __init__(self, dim, num_heads=1, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim ** -0.5 + + self.fc_q = nn.Linear(dim, dim * 1, bias_attr=qkv_bias) + self.fc_kv = nn.Linear(dim, dim * 2, bias_attr=qkv_bias) + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + + def forward(self, q, x): + # q:[B,12,256] x:[B,HW,256] + B, N, C = x.shape + n_class = q.shape[1] + + q = self.fc_q(q).reshape([B, self.num_heads, n_class, C // self.num_heads]) + kv = self.fc_kv(x).reshape([B, N, 2, self.num_heads, C // self.num_heads]).transpose([2, 0, 3, 1, 4]) + k, v = kv[0], kv[1] # [B, num_head, HW, 256/num_head] + + attn1 = (q @ k.transpose([0, 1, 3, 2])) * self.scale #[B, num_head, 12, HW] + attn2 = F.softmax(attn1, axis=-1) + attn3 = self.attn_drop(attn2) #[B, num_head, 11, HW] + + + x = (attn3 @ v).reshape([B, n_class, C]) + x = self.proj(x) + x = self.proj_drop(x) # [B, 12, 256] + + attn = attn1.transpose([0, 2, 1, 3]) + + return attn, x + + 
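For orientation, `Attention_Decoder` cross-attends a fixed set of learnable class queries against the flattened feature map and returns both the pre-softmax attention maps and the refined queries. A minimal shape check could look like the sketch below; the dim / n_class / H*W values are arbitrary, and the import path is only inferred from the file location in this diff:

```python
import paddle
# assumed import path, based on the file location shown in the diff header
from src.models.backbones.trans2seg_transformer import Attention_Decoder

dec_attn = Attention_Decoder(dim=256, num_heads=8)
query = paddle.randn([2, 12, 256])    # [B, n_class, C] learnable class embeddings
feat = paddle.randn([2, 1024, 256])   # [B, H*W, C] flattened encoder features

attn, out = dec_attn(query, feat)
print(attn.shape)  # [2, 12, 8, 1024] -> [B, n_class, num_heads, H*W] pre-softmax scores
print(out.shape)   # [2, 12, 256]     -> [B, n_class, C] refined class queries
```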
+class Block_Encoder(nn.Layer): + """Block Encoder Implement + + consists of a multi-head self-attention module and a feed forward network + + Attributes: + dim: int, input dimension (channels) + num_heads: int, number of attention heads + mlp_ratio: float, ratio of mlp hidden dim and input embedding dim, default: 4. + qkv_bias: bool, if True, enable learnable bias to q,k,v, default: False + qk_scale: float, override default qk scale head_dim**-0.5 if set, default: None + drop: dropout rate for Mlp module + attn_drop: float, dropout of attention + drop_path: drop path for stochastic depth + act_layer: activation layer type, default: nn.GELU + norm_layer: normalization layer type, default: nn.LayerNorm + """ + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention_Encoder( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, dropout=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class Block_Decoder(nn.Layer): + """Block Decoder Implement + + Attributes: + dim: int, input dimension (channels) + num_heads: int, number of attention heads + feat_HxW: control Mlp in_features dim + mlp_ratio: float, ratio of mlp hidden dim and input embedding dim, default: 4. + qkv_bias: bool, if True, enable learnable bias to q,k,v, default: False + qk_scale: float, override default qk scale head_dim**-0.5 if set, default: None + drop: float, dropout rate for Mlp module, default: 0. + attn_drop: float, dropout rate of attention, default: 0. + drop_path: float, drop path for stochastic depth, default: 0. + act_layer: activation layer type, default: nn.GELU + norm_layer: normalization layer type, default: nn.LayerNorm + """ + def __init__(self, dim, num_heads, feat_HxW, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., + attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.norm1_clsembed = norm_layer(dim) + + self.attn = Attention_Decoder( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + self.norm2 = norm_layer(dim) + self.norm3 = norm_layer(dim) + self.norm4 = norm_layer(1024) + + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, dropout=drop) + self.mlp2 = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, dropout=drop) + self.mlp3 = Mlp(in_features=feat_HxW, hidden_features=feat_HxW*3, dropout=drop) + + def forward(self, query, feat): + # query:[B,12,256] feat:[B,12,HW] + attn, query = self.attn(self.norm1_clsembed(query), self.norm1(feat)) + query = query + self.drop_path(query) + query = query + self.drop_path(self.mlp(self.norm2(query))) + + feat = feat + self.drop_path(feat) + feat = feat + self.drop_path(self.mlp2(self.norm3(feat))) + + attn = attn + self.drop_path(attn) + attn = attn + self.drop_path(self.mlp3(self.norm4(attn))) + + return attn, query, feat + + +class TransformerEncoder(nn.Layer): + """Transformer Encoder Implement + + Attributes: + embed_dim: int, embedding dimension, embed_dim: 768 + depth: int, nums of Block_Encoder, default: 12 + num_patches: int, pos_embed dim, default: 32*32 + num_heads: int, number of attention heads, default: 12 + mlp_ratio: float, ratio of mlp hidden dim and input embedding dim, default: 4. + qkv_bias: bool, if True, enable learnable bias to q,k,v, default: False + qk_scale: float, override default qk scale head_dim**-0.5 if set, default: None + drop_rate: float, rate of dropout, default: 0 + drop_path_rate: in order to implement stochastic depth decay rule, default: 0. + attn_drop_rate: float, dropout rate of attention + norm_layer: normalization layer type, default: nn.LayerNorm + """ + def __init__(self, embed_dim=768, depth=12, num_patches=32*32, num_heads=12, mlp_ratio=4., qkv_bias=False, + qk_scale=None, drop_rate=0., drop_path_rate=0., attn_drop_rate=0., norm_layer=nn.LayerNorm): + super().__init__() + self.cls_token = paddle.create_parameter(shape=[1, 1, embed_dim], dtype='float32', + default_initializer=nn.initializer.Constant(0.0)) + self.pos_embed = paddle.create_parameter(shape=[1, num_patches + 1, embed_dim], dtype='float32', + default_initializer=nn.initializer.Constant(0.0)) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in paddle.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks_encoder = nn.LayerList([ + Block_Encoder( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer) + for i in range(depth)]) + + self.norm = norm_layer(embed_dim) + + trunc_normal_(self.cls_token, std=.02) + trunc_normal_(self.pos_embed, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + m.bias = paddle.create_parameter(shape=m.bias.shape, dtype='float32', + default_initializer=nn.initializer.Constant(value=0.0)) + elif isinstance(m, nn.LayerNorm): + m.weight = paddle.create_parameter(shape=m.weight.shape, dtype='float32', + default_initializer=nn.initializer.Constant(value=1.0)) + m.bias = paddle.create_parameter(shape=m.bias.shape, dtype='float32', + default_initializer=nn.initializer.Constant(value=0.0)) + + def resize_pos_embed(self, x, pos_embed): + if x.shape[1] == pos_embed.shape[1]: + return pos_embed + + n, hw, c = x.shape + x_h = x_w = int(math.sqrt(hw-1)) + assert x_h * x_w == hw-1 + + cls_pos_embed, feat_pos_embed = 
pos_embed[:,0:1,:], pos_embed[:,1:,:] + feat_h = feat_w = int(math.sqrt(feat_pos_embed.shape[1])) + assert feat_h * feat_w == feat_pos_embed.shape[1] + feat_pos_embed = feat_pos_embed.reshape([feat_pos_embed.shape[0], feat_h, feat_w, -1]).transpose([0,3,1,2]) #[n,c,h,w] + feat_pos_embed = F.interpolate(feat_pos_embed, (x_h, x_w), mode='bilinear', align_corners=True).transpose([0,2,3,1])\ + .reshape([feat_pos_embed.shape[0],x_h*x_w, -1]) + + new_pos_embed = paddle.concat([cls_pos_embed, feat_pos_embed], axis=1) + assert new_pos_embed.shape[1] == x.shape[1] + return new_pos_embed + + def forward_encoder(self, x): + B = x.shape[0] + cls_tokens = self.cls_token.expand([B, -1, -1]) # stole cls_tokens impl from Phil Wang, thanks + x = paddle.concat((cls_tokens, x), axis=1) + + pos_embed = self.pos_embed + pos_embed = self.resize_pos_embed(x, pos_embed) + x = x + pos_embed + x = self.pos_drop(x) + for blk in self.blocks_encoder: + x = blk(x) + + x = self.norm(x) + return x[:, 0], x[:, 1:] + + +class TransformerDecoder(nn.Layer): + """Transformer Decoder Implement + + Attributes: + embed_dim: int, embedding dimension, embed_dim: 768 + depth: int, nums of Block_Encoder, default: 12 + decoder_feat_HxW: int, control Mlp in_features dim, default: 1024 + num_heads: int, number of attention heads, default: 12 + mlp_ratio: float, ratio of mlp hidden dim and input embedding dim, default: 4. + qkv_bias: bool, if True, enable learnable bias to q,k,v, default: False + qk_scale: float, override default qk scale head_dim**-0.5 if set, default: None + drop_rate: float, rate of dropout, default: 0 + drop_path_rate: in order to implement stochastic depth decay rule, default: 0. + attn_drop_rate: float, dropout rate of attention + norm_layer: normalization layer type, default: nn.LayerNorm + """ + def __init__(self, embed_dim=768, depth=12, nclass=12, decoder_feat_HxW=1024, num_heads=12, mlp_ratio=4., + qkv_bias=False, qk_scale=None, drop_rate=0., drop_path_rate=0., attn_drop_rate=0., norm_layer=nn.LayerNorm): + super().__init__() + self.cls_embed = paddle.create_parameter(shape=[1, nclass, embed_dim], dtype='float32', + default_initializer=nn.initializer.Constant(0.0)) + + dpr = [x.item() for x in paddle.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks_decoder = nn.LayerList([ + Block_Decoder( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, feat_HxW=decoder_feat_HxW, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer) + for i in range(depth)]) + + trunc_normal_(self.cls_embed, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + m.bias = paddle.create_parameter(shape=m.bias.shape, dtype='float32', + default_initializer=nn.initializer.Constant(value=0.0)) + elif isinstance(m, nn.LayerNorm): + m.weight = paddle.create_parameter(shape=m.weight.shape, dtype='float32', + default_initializer=nn.initializer.Constant(value=1.0)) + m.bias = paddle.create_parameter(shape=m.bias.shape, dtype='float32', + default_initializer=nn.initializer.Constant(value=0.0)) + + def forward_decoder(self, x): + attns_list = [] + feat = x + B = feat.shape[0] + + for idx, blk in enumerate(self.blocks_decoder): + if idx == 0: + query = self.cls_embed.expand([B, -1, -1]) + else: + query += self.cls_embed.expand([B, -1, -1]) + attn, query, feat = blk(query, feat) + attns_list.append(attn) + + 
return attns_list diff --git a/semantic_segmentation/src/models/backbones/vit.py b/semantic_segmentation/src/models/backbones/vit.py new file mode 100644 index 00000000..6094f013 --- /dev/null +++ b/semantic_segmentation/src/models/backbones/vit.py @@ -0,0 +1,307 @@ +""" +Implement Transformer Class for ViT +""" + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from src.utils import load_pretrained_model + + +class Embeddings(nn.Layer): + """Patch Embeddings and Position Embeddings + + Apply patch embeddings and position embeddings on input images. + Currently hybrid is not supported yet. + + Attributes: + hybrid: TODO. + patch_embddings: impl using a patch_size x patch_size Conv2D operation + position_embddings: a parameter with len = num_patch + 1(for cls_token) + cls_token: token insert to the patch feature for classification + dropout: dropout for embeddings + """ + + def __init__(self, config, in_channels=3): + super(Embeddings, self).__init__() + self.hybrid = config.MODEL.TRANS.HYBRID + image_size = config.DATA.CROP_SIZE + self.keep_cls_token = config.MODEL.TRANS.KEEP_CLS_TOKEN + if self.hybrid: + #TODO: add resnet model + self.hybrid_model = None + + if config.MODEL.TRANS.PATCH_GRID is not None: + self.hybrid = True + grid_size = config.MODEL.TRANS.PATCH_GRID + patch_size = (image_size[0] // 16 // grid_size, image_size[1] // 16 // grid_size) + n_patches = (image_size[0] // 16) * (image_size[1] // 16) + else: + self.hybrid = False + patch_size = config.MODEL.TRANS.PATCH_SIZE + n_patches = (image_size[0] // patch_size) * (image_size[1] // patch_size) + + self.patch_embeddings = nn.Conv2D(in_channels=in_channels, + out_channels=config.MODEL.TRANS.HIDDEN_SIZE, + kernel_size=patch_size, + stride=patch_size) + + self.position_embeddings = paddle.create_parameter( + shape=[1, n_patches+1, config.MODEL.TRANS.HIDDEN_SIZE], + dtype='float32', + default_initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) # may be important + + self.cls_token = paddle.create_parameter( + shape=[1, 1, config.MODEL.TRANS.HIDDEN_SIZE], + dtype='float32', + default_initializer=paddle.nn.initializer.Constant(0)) + + self.dropout = nn.Dropout(config.MODEL.DROPOUT) + + def forward(self, x): + cls_tokens = self.cls_token[0].expand((x.shape[0], -1, -1)) + if self.hybrid: + # x = self.hybrid_model(x) # TODO + pass + x = self.patch_embeddings(x) + x = x.flatten(2) + x = x.transpose([0, 2, 1]) + x = paddle.concat((cls_tokens, x), axis=1) + embeddings = x + self.position_embeddings[0] # tensor broadcast + if not self.keep_cls_token: + embeddings = embeddings[:, 1:] # For SETR + embeddings = self.dropout(embeddings) + return embeddings + + +class Attention(nn.Layer): + """ Attention module + + Attention module for ViT, here q, k, v are assumed the same. + The qkv mappings are stored as one single param. 
+ + Attributes: + num_heads: number of heads + attn_head_size: feature dim of single head + all_head_size: feature dim of all heads + qkv: a nn.Linear for q, k, v mapping + scales: 1 / sqrt(single_head_feature_dim) + out: projection of multi-head attention + attn_dropout: dropout for attention + proj_dropout: final dropout before output + softmax: softmax op for attention + """ + + def __init__(self, config): + super(Attention, self).__init__() + self.num_heads = config.MODEL.TRANS.NUM_HEADS + self.attn_head_size = int(config.MODEL.TRANS.HIDDEN_SIZE / self.num_heads) + self.all_head_size = self.attn_head_size * self.num_heads + + w_attr_1, b_attr_1 = self._init_weights() + self.qkv = nn.Linear(config.MODEL.TRANS.HIDDEN_SIZE, + self.all_head_size*3, + weight_attr=w_attr_1, + bias_attr=b_attr_1 if config.MODEL.TRANS.QKV_BIAS else False) + + self.scales = self.attn_head_size ** -0.5 # 0.125 for Large + + w_attr_2, b_attr_2 = self._init_weights() + self.out = nn.Linear(config.MODEL.TRANS.HIDDEN_SIZE, + config.MODEL.TRANS.HIDDEN_SIZE, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + + self.attn_dropout = nn.Dropout(config.MODEL.ATTENTION_DROPOUT) + self.proj_dropout = nn.Dropout(config.MODEL.DROPOUT) + + self.softmax = nn.Softmax(axis=-1) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + return weight_attr, bias_attr + + def transpose_multihead(self, x): + new_shape = x.shape[:-1] + [self.num_heads, self.attn_head_size] + x = x.reshape(new_shape) + x = x.transpose([0, 2, 1, 3]) + return x + + def forward(self, x): + qkv = self.qkv(x).chunk(3, axis=-1) + q, k, v = map(self.transpose_multihead, qkv) + + attn = paddle.matmul(q, k, transpose_y=True) + attn = attn * self.scales + attn = self.softmax(attn) + attn_weights = attn + attn = self.attn_dropout(attn) + + z = paddle.matmul(attn, v) + z = z.transpose([0, 2, 1, 3]) + new_shape = z.shape[:-2] + [self.all_head_size] + z = z.reshape(new_shape) + # reshape + z = self.out(z) + z = self.proj_dropout(z) + return z, attn_weights + + +class Mlp(nn.Layer): + """ MLP module + + Impl using nn.Linear and activation is GELU, dropout is applied. 
+ Ops: fc -> act -> dropout -> fc -> dropout + + Attributes: + fc1: nn.Linear + fc2: nn.Linear + act: GELU + dropout1: dropout after fc1 + dropout2: dropout after fc2 + """ + + def __init__(self, config): + super(Mlp, self).__init__() + + w_attr_1, b_attr_1 = self._init_weights() + self.fc1 = nn.Linear(config.MODEL.TRANS.HIDDEN_SIZE, + int(config.MODEL.TRANS.MLP_RATIO * config.MODEL.TRANS.HIDDEN_SIZE), + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + w_attr_2, b_attr_2 = self._init_weights() + self.fc2 = nn.Linear(int(config.MODEL.TRANS.MLP_RATIO * config.MODEL.TRANS.HIDDEN_SIZE), + config.MODEL.TRANS.HIDDEN_SIZE, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + self.act = nn.GELU() + self.dropout1 = nn.Dropout(config.MODEL.DROPOUT) + #self.dropout2 = nn.Dropout(config.MODEL.DROPOUT) + + def _init_weights(self): + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.XavierUniform()) #default in pp: xavier + bias_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Normal(std=1e-6)) #default in pp: zero + + return weight_attr, bias_attr + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.dropout1(x) + x = self.fc2(x) + x = self.dropout1(x) + return x + + +class EncoderLayer(nn.Layer): + """Encoder Layer + + Encoder layer contains attention, norm, mlp and residual + + Attributes: + hidden_size: transformer feature dim + attn_norm: nn.LayerNorm before attention + mlp_norm: nn.LayerNorm before mlp + mlp: mlp modual + attn: attention modual + """ + + def __init__(self, config): + super(EncoderLayer, self).__init__() + self.hidden_size = config.MODEL.TRANS.HIDDEN_SIZE + self.attn_norm = nn.LayerNorm(config.MODEL.TRANS.HIDDEN_SIZE, epsilon=1e-6) + self.mlp_norm = nn.LayerNorm(config.MODEL.TRANS.HIDDEN_SIZE, epsilon=1e-6) + self.mlp = Mlp(config) + self.attn = Attention(config) + + def forward(self, x): + h = x + x = self.attn_norm(x) + x, attn = self.attn(x) + x = x + h + + h = x + x = self.mlp_norm(x) + x = self.mlp(x) + x = x + h + + return x, attn + + +class Encoder(nn.Layer): + """Encoder + + Encoder contains a list of EncoderLayer, and a LayerNorm at the end. + + Attributes: + layers: nn.LayerList contains multiple EncoderLayers + encoder_norm: nn.LayerNorm which is applied after last encoder layer + """ + + def __init__(self, config): + super(Encoder, self).__init__() + self.layers = nn.LayerList([EncoderLayer(config) for _ in range(config.MODEL.TRANS.NUM_LAYERS)]) + #self.encoder_norm = nn.LayerNorm(config.MODEL.TRANS.HIDDEN_SIZE, epsilon=1e-6) + self.out_idx_list = tuple(range(config.MODEL.TRANS.NUM_LAYERS)) + + def forward(self, x): + self_attn = [] + outs = [] + for layer_idx, layer in enumerate(self.layers): + x, attn = layer(x) + self_attn.append(attn) + if layer_idx in self.out_idx_list: + outs.append(x) + #out = self.encoder_norm(x) + return outs + + +class Transformer(nn.Layer): + """Transformer + + Attributes: + embeddings: patch embeddings and position embeddings + encoder: encoder layers with multihead self attention + """ + + def __init__(self, config): + super(Transformer, self).__init__() + self.embeddings = Embeddings(config) + self.dropout = nn.Dropout(config.MODEL.DROPOUT) + self.encoder = Encoder(config) + + def forward(self, x): + embedding_out = self.embeddings(x) + embedding_out = self.dropout(embedding_out) + encoder_outs = self.encoder(embedding_out) + return encoder_outs + +class VisualTransformer(nn.Layer): + """ VisualTransformer + + Vision Transformer as the backbone of SETR-PUP and SETR-Naive. + Ref. 
https://arxiv.org/pdf/2012.15840.pdf + + """ + def __init__(self, config): + super(VisualTransformer, self).__init__() + self.transformer = Transformer(config) + self.out_indices = config.MODEL.ENCODER.OUT_INDICES + norm_weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + norm_bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + if config.MODEL.PRETRAINED is not None: + load_pretrained_model(self, config.MODEL.PRETRAINED) + + def forward(self, x): + B = x.shape[0] + outs = self.transformer(x) + feats = [ ] + # getting multi-level feats from layers + for idx in self.out_indices: + feats.append(outs[idx]) + return feats + diff --git a/semantic_segmentation/src/models/backbones/vit_mla.py b/semantic_segmentation/src/models/backbones/vit_mla.py new file mode 100644 index 00000000..1dcff07b --- /dev/null +++ b/semantic_segmentation/src/models/backbones/vit_mla.py @@ -0,0 +1,433 @@ +""" +Implement Transformer Class for ViT_MLA +""" + +import math +import paddle +import paddle.nn as nn +from src.utils import load_pretrained_model + + +class Embeddings(nn.Layer): + """Patch Embeddings and Position Embeddings + + Apply patch embeddings and position embeddings on input images. + Currently hybrid is not supported yet. + + Attributes: + hybrid: TODO. + patch_embddings: impl using a patch_size x patch_size Conv2D operation + position_embddings: a parameter with len = num_patch + 1(for cls_token) + cls_token: token insert to the patch feature for classification + dropout: dropout for embeddings + """ + + def __init__(self, config, in_channels=3): + super(Embeddings, self).__init__() + self.hybrid = config.MODEL.TRANS.HYBRID + image_size = config.DATA.CROP_SIZE + + if self.hybrid: + #TODO: add resnet model + self.hybrid_model = None + + if config.MODEL.TRANS.PATCH_GRID is not None: + self.hybrid = True + grid_size = config.MODEL.TRANS.PATCH_GRID + patch_size = (image_size[0] // 16 // grid_size, image_size[1] // 16 // grid_size) + n_patches = (image_size[0] // 16) * (image_size[1] // 16) + else: + self.hybrid = False + patch_size = config.MODEL.TRANS.PATCH_SIZE + n_patches = (image_size[0] // patch_size) * (image_size[1] // patch_size) + + self.patch_embeddings = nn.Conv2D(in_channels=in_channels, + out_channels=config.MODEL.TRANS.HIDDEN_SIZE, + kernel_size=patch_size, + stride=patch_size) + + self.position_embeddings = paddle.create_parameter( + shape=[1, n_patches+1, config.MODEL.TRANS.HIDDEN_SIZE], + dtype='float32', + default_initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) # may be important + + self.cls_token = paddle.create_parameter( + shape=[1, 1, config.MODEL.TRANS.HIDDEN_SIZE], + dtype='float32', + default_initializer=paddle.nn.initializer.Constant(0)) + + self.dropout = nn.Dropout(config.MODEL.DROPOUT) + + def forward(self, x): + cls_tokens = self.cls_token[0].expand((x.shape[0], -1, -1)) + if self.hybrid: + # x = self.hybrid_model(x) # TODO + pass + x = self.patch_embeddings(x) + x = x.flatten(2) + x = x.transpose([0, 2, 1]) + x = paddle.concat((cls_tokens, x), axis=1) + embeddings = x + self.position_embeddings[0] # tensor broadcast + embeddings = embeddings[:, 1:] # For SETR + embeddings = self.dropout(embeddings) + return embeddings + + +class Attention(nn.Layer): + """ Attention module + + Attention module for ViT, here q, k, v are assumed the same. + The qkv mappings are stored as one single param. 
+ + Attributes: + num_heads: number of heads + attn_head_size: feature dim of single head + all_head_size: feature dim of all heads + qkv: a nn.Linear for q, k, v mapping + scales: 1 / sqrt(single_head_feature_dim) + out: projection of multi-head attention + attn_dropout: dropout for attention + proj_dropout: final dropout before output + softmax: softmax op for attention + """ + + def __init__(self, config): + super(Attention, self).__init__() + self.num_heads = config.MODEL.TRANS.NUM_HEADS + self.attn_head_size = int(config.MODEL.TRANS.HIDDEN_SIZE / self.num_heads) + self.all_head_size = self.attn_head_size * self.num_heads + + w_attr_1, b_attr_1 = self._init_weights() + self.qkv = nn.Linear(config.MODEL.TRANS.HIDDEN_SIZE, + self.all_head_size*3, + weight_attr=w_attr_1, + bias_attr=b_attr_1 if config.MODEL.TRANS.QKV_BIAS else False) + + self.scales = self.attn_head_size ** -0.5 # 0.125 for Large + + w_attr_2, b_attr_2 = self._init_weights() + self.out = nn.Linear(config.MODEL.TRANS.HIDDEN_SIZE, + config.MODEL.TRANS.HIDDEN_SIZE, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + + self.attn_dropout = nn.Dropout(config.MODEL.ATTENTION_DROPOUT) + self.proj_dropout = nn.Dropout(config.MODEL.DROPOUT) + + self.softmax = nn.Softmax(axis=-1) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + return weight_attr, bias_attr + + def transpose_multihead(self, x): + new_shape = x.shape[:-1] + [self.num_heads, self.attn_head_size] + x = x.reshape(new_shape) + x = x.transpose([0, 2, 1, 3]) + return x + + def forward(self, x): + qkv = self.qkv(x).chunk(3, axis=-1) + q, k, v = map(self.transpose_multihead, qkv) + + attn = paddle.matmul(q, k, transpose_y=True) + attn = attn * self.scales + attn = self.softmax(attn) + attn_weights = attn + attn = self.attn_dropout(attn) + + z = paddle.matmul(attn, v) + z = z.transpose([0, 2, 1, 3]) + new_shape = z.shape[:-2] + [self.all_head_size] + z = z.reshape(new_shape) + # reshape + z = self.out(z) + z = self.proj_dropout(z) + return z, attn_weights + + +class Mlp(nn.Layer): + """ MLP module + + Impl using nn.Linear and activation is GELU, dropout is applied. 
+ Ops: fc -> act -> dropout -> fc -> dropout + + Attributes: + fc1: nn.Linear + fc2: nn.Linear + act: GELU + dropout1: dropout after fc1 + dropout2: dropout after fc2 + """ + + def __init__(self, config): + super(Mlp, self).__init__() + + w_attr_1, b_attr_1 = self._init_weights() + self.fc1 = nn.Linear(config.MODEL.TRANS.HIDDEN_SIZE, + int(config.MODEL.TRANS.MLP_RATIO * config.MODEL.TRANS.HIDDEN_SIZE), + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + w_attr_2, b_attr_2 = self._init_weights() + self.fc2 = nn.Linear(int(config.MODEL.TRANS.MLP_RATIO * config.MODEL.TRANS.HIDDEN_SIZE), + config.MODEL.TRANS.HIDDEN_SIZE, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + self.act = nn.GELU() + self.dropout1 = nn.Dropout(config.MODEL.DROPOUT) + #self.dropout2 = nn.Dropout(config.MODEL.DROPOUT) + + def _init_weights(self): + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.XavierUniform()) #default in pp: xavier + bias_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Normal(std=1e-6)) #default in pp: zero + + return weight_attr, bias_attr + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.dropout1(x) + x = self.fc2(x) + x = self.dropout1(x) + return x + + +class EncoderLayer(nn.Layer): + """Encoder Layer + + Encoder layer contains attention, norm, mlp and residual + + Attributes: + hidden_size: transformer feature dim + attn_norm: nn.LayerNorm before attention + mlp_norm: nn.LayerNorm before mlp + mlp: mlp modual + attn: attention modual + """ + + def __init__(self, config): + super(EncoderLayer, self).__init__() + self.hidden_size = config.MODEL.TRANS.HIDDEN_SIZE + self.attn_norm = nn.LayerNorm(config.MODEL.TRANS.HIDDEN_SIZE, epsilon=1e-6) + self.mlp_norm = nn.LayerNorm(config.MODEL.TRANS.HIDDEN_SIZE, epsilon=1e-6) + self.mlp = Mlp(config) + self.attn = Attention(config) + + def forward(self, x): + h = x + x = self.attn_norm(x) + x, attn = self.attn(x) + x = x + h + + h = x + x = self.mlp_norm(x) + x = self.mlp(x) + x = x + h + + return x, attn + + +class Encoder(nn.Layer): + """Encoder + + Encoder contains a list of EncoderLayer, and a LayerNorm at the end. + + Attributes: + layers: nn.LayerList contains multiple EncoderLayers + encoder_norm: nn.LayerNorm which is applied after last encoder layer + """ + + def __init__(self, config): + super(Encoder, self).__init__() + self.layers = nn.LayerList([EncoderLayer(config) for _ in range(config.MODEL.TRANS.NUM_LAYERS)]) + #self.encoder_norm = nn.LayerNorm(config.MODEL.TRANS.HIDDEN_SIZE, epsilon=1e-6) + self.out_idx_list = tuple(range(config.MODEL.TRANS.NUM_LAYERS)) + + def forward(self, x): + self_attn = [] + outs = [] + for layer_idx, layer in enumerate(self.layers): + x, attn = layer(x) + self_attn.append(attn) + if layer_idx in self.out_idx_list: + outs.append(x) + #out = self.encoder_norm(x) + return outs + + +class Transformer(nn.Layer): + """Transformer + + Attributes: + embeddings: patch embeddings and position embeddings + encoder: encoder layers with multihead self attention + """ + + def __init__(self, config): + super(Transformer, self).__init__() + self.embeddings = Embeddings(config) + self.dropout = nn.Dropout(config.MODEL.DROPOUT) + self.encoder = Encoder(config) + + def forward(self, x): + embedding_out = self.embeddings(x) + embedding_out = self.dropout(embedding_out) + encoder_outs = self.encoder(embedding_out) + + return encoder_outs + + + +class Conv_MLA(nn.Layer): + """Conv_MLA + + Multi-Level feature Aggregatio, Ref. 
https://arxiv.org/pdf/2012.15840.pdf + + """ + def __init__(self, in_channels=1024, mla_channels=256): + super(Conv_MLA, self).__init__() + norm_bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + self.mla_p2_1x1 = nn.Sequential( + nn.Conv2D(in_channels, mla_channels, 1, bias_attr=False), + nn.SyncBatchNorm( + mla_channels, + weight_attr=self.get_norm_weight_attr(), + bias_attr=norm_bias_attr), + nn.ReLU()) + self.mla_p3_1x1 = nn.Sequential( + nn.Conv2D(in_channels, mla_channels, 1, bias_attr=False), + nn.SyncBatchNorm( + mla_channels, + weight_attr=self.get_norm_weight_attr(), + bias_attr=norm_bias_attr), + nn.ReLU()) + self.mla_p4_1x1 = nn.Sequential( + nn.Conv2D(in_channels, mla_channels, 1, bias_attr=False), + nn.SyncBatchNorm( + mla_channels, + weight_attr=self.get_norm_weight_attr(), + bias_attr=norm_bias_attr), + nn.ReLU()) + self.mla_p5_1x1 = nn.Sequential( + nn.Conv2D(in_channels, mla_channels, 1, bias_attr=False), + nn.SyncBatchNorm( + mla_channels, + weight_attr=self.get_norm_weight_attr(), + bias_attr=norm_bias_attr), + nn.ReLU()) + + self.mla_p2 = nn.Sequential( + nn.Conv2D(mla_channels, mla_channels, 3, padding=1, bias_attr=False), + nn.SyncBatchNorm( + mla_channels, + weight_attr=self.get_norm_weight_attr(), + bias_attr=norm_bias_attr), + nn.ReLU()) + self.mla_p3 = nn.Sequential( + nn.Conv2D(mla_channels, mla_channels, 3, padding=1, bias_attr=False), + nn.SyncBatchNorm( + mla_channels, + weight_attr=self.get_norm_weight_attr(), + bias_attr=norm_bias_attr), + nn.ReLU()) + self.mla_p4 = nn.Sequential( + nn.Conv2D(mla_channels, mla_channels, 3, padding=1, bias_attr=False), + nn.SyncBatchNorm( + mla_channels, + weight_attr=self.get_norm_weight_attr(), + bias_attr=norm_bias_attr), + nn.ReLU()) + self.mla_p5 = nn.Sequential( + nn.Conv2D(mla_channels, mla_channels, 3, padding=1, bias_attr=False), + nn.SyncBatchNorm( + mla_channels, + weight_attr=self.get_norm_weight_attr(), + bias_attr=norm_bias_attr), + nn.ReLU()) + + def get_norm_weight_attr(self): + return paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + + def to_2D(self, x): + n, hw, c = x.shape + h = w = int(math.sqrt(hw)) + x = x.transpose([0, 2, 1]).reshape([n, c, h, w]) + return x + + def forward(self, res2, res3, res4, res5): + res2 = self.to_2D(res2) + res3 = self.to_2D(res3) + res4 = self.to_2D(res4) + res5 = self.to_2D(res5) + + mla_p5_1x1 = self.mla_p5_1x1(res5) + mla_p4_1x1 = self.mla_p4_1x1(res4) + mla_p3_1x1 = self.mla_p3_1x1(res3) + mla_p2_1x1 = self.mla_p2_1x1(res2) + + mla_p4_plus = mla_p5_1x1 + mla_p4_1x1 + mla_p3_plus = mla_p4_plus + mla_p3_1x1 + mla_p2_plus = mla_p3_plus + mla_p2_1x1 + + mla_p5 = self.mla_p5(mla_p5_1x1) + mla_p4 = self.mla_p4(mla_p4_plus) + mla_p3 = self.mla_p3(mla_p3_plus) + mla_p2 = self.mla_p2(mla_p2_plus) + + return mla_p2, mla_p3, mla_p4, mla_p5 + + +class ViT_MLA(nn.Layer): + """ ViT_MLA + + Vision Transformer with MLA (ViT_MLA) as the backbone of SETR-MLA. + Ref. 
https://arxiv.org/pdf/2012.15840.pdf + + """ + def __init__(self, config): + super(ViT_MLA, self).__init__() + self.transformer = Transformer(config) + self.mla = Conv_MLA(in_channels=config.MODEL.TRANS.HIDDEN_SIZE, mla_channels=config.MODEL.MLA.MLA_CHANNELS) + self.mla_index = config.MODEL.ENCODER.OUT_INDICES + + norm_weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + norm_bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + self.norm_0 = nn.LayerNorm(config.MODEL.TRANS.HIDDEN_SIZE, epsilon=1e-06, + weight_attr=norm_weight_attr, bias_attr=norm_bias_attr) + self.norm_1 = nn.LayerNorm(config.MODEL.TRANS.HIDDEN_SIZE, epsilon=1e-06, + weight_attr=norm_weight_attr, bias_attr=norm_bias_attr) + self.norm_2 = nn.LayerNorm(config.MODEL.TRANS.HIDDEN_SIZE, epsilon=1e-06, + weight_attr=norm_weight_attr, bias_attr=norm_bias_attr) + self.norm_3 = nn.LayerNorm(config.MODEL.TRANS.HIDDEN_SIZE, epsilon=1e-06, + weight_attr=norm_weight_attr, bias_attr=norm_bias_attr) + + if config.MODEL.PRETRAINED is not None: + load_pretrained_model(self, config.MODEL.PRETRAINED) + # TODO: whether set the learning rate coef of Conv_MLA module as config.TRAIN.DECODER_LR_COEF (default: 1) + # print("init learning rate coef for parital encoder (Conv_MLA)") + """ + for sublayer in self.mla.sublayers(): + if isinstance(sublayer, nn.Conv2D): + sublayer.weight.optimize_attr['learning_rate'] = config.TRAIN.DECODER_LR_COEF + if sublayer.bias is not None: + sublayer.bias.optimize_attr['learning_rate'] = config.TRAIN.DECODER_LR_COEF + if isinstance(sublayer, nn.SyncBatchNorm) or isinstance(sublayer, nn.BatchNorm2D) or isinstance(sublayer,nn.LayerNorm): + # set lr coef + sublayer.weight.optimize_attr['learning_rate'] = config.TRAIN.DECODER_LR_COEF + sublayer.bias.optimize_attr['learning_rate'] = config.TRAIN.DECODER_LR_COEF + + """ + + def forward(self, x): + outs = self.transformer(x) + c6 = self.norm_0(outs[self.mla_index[0]]) + c12 = self.norm_1(outs[self.mla_index[1]]) + c18 = self.norm_2(outs[self.mla_index[2]]) + c24 = self.norm_3(outs[self.mla_index[3]]) + mla_p2, mla_p3, mla_p4, mla_p5 = self.mla(c6, c12, c18, c24) + return [mla_p2, mla_p3, mla_p4, mla_p5] + diff --git a/semantic_segmentation/src/models/decoders/__init__.py b/semantic_segmentation/src/models/decoders/__init__.py new file mode 100644 index 00000000..b87e7833 --- /dev/null +++ b/semantic_segmentation/src/models/decoders/__init__.py @@ -0,0 +1,9 @@ +from .vit_mla_head import VIT_MLAHead +from .vit_mla_auxi_head import VIT_MLA_AUXIHead +from .vit_up_head import VisionTransformerUpHead +from .uper_head import UperHead +from .fcn_head import FCNHead +from .dpt_head import DPTHead +from .segmentor_head import MaskTransformer +from .segmentor_head import LinearDecoder +from .trans2seg_head import * diff --git a/semantic_segmentation/src/models/decoders/dpt_head.py b/semantic_segmentation/src/models/decoders/dpt_head.py new file mode 100644 index 00000000..2f102de4 --- /dev/null +++ b/semantic_segmentation/src/models/decoders/dpt_head.py @@ -0,0 +1,278 @@ +import copy +import paddle +import paddle.nn as nn + + +def readout_oper(config): + """get the layer to process the feature asnd the cls token + """ + class Drop(object): + """drop class + just drop the cls token + """ + def __init__(self, config): + if 'ViT' in config.MODEL.ENCODER.TYPE: + self.token_num = 1 + elif 'DeiT' in config.MODEL.ENCODER.TYPE: + self.token_num = 2 + self.feature_size = (config.DATA.CROP_SIZE[0] // config.MODEL.TRANS.PATCH_SIZE, + 
config.DATA.CROP_SIZE[1] // config.MODEL.TRANS.PATCH_SIZE) + + def __call__(self, x): + x = x[:, self.token_num:] + x = x.transpose((0, 2, 1)) + x = x.reshape((x.shape[0], x.shape[1], self.feature_size[0], self.feature_size[1])) + return x + + + class Add(object): + """add class + add the cls token + """ + def __init__(self, config): + if 'ViT' in config.MODEL.ENCODER.TYPE: + self.token_num = 1 + elif 'DeiT' in config.MODEL.ENCODER.TYPE: + self.token_num = 2 + self.feature_size = (config.DATA.CROP_SIZE[0] // config.MODEL.TRANS.PATCH_SIZE, + config.DATA.CROP_SIZE[1] // config.MODEL.TRANS.PATCH_SIZE) + + def __call__(self, x): + token = x[:, :self.token_num] + token = paddle.sum(token, axis=1).unsqueeze(1) + x = x[:, self.token_num:] + x = x + token + x = x.transpose((0, 2, 1)) + x = x.reshape((x.shape[0], x.shape[1], self.feature_size[0], self.feature_size[1])) + return x + + class Proj(nn.Layer): + """porject class + use a linear layer to confuse the feature and the cls token + """ + def __init__(self, config): + super(Proj, self).__init__() + if 'ViT' in config.MODEL.ENCODER.TYPE: + self.token_num = 1 + elif 'DeiT' in config.MODEL.ENCODER.TYPE: + self.token_num = 2 + self.feature_size = (config.DATA.CROP_SIZE[0] // config.MODEL.TRANS.PATCH_SIZE, + config.DATA.CROP_SIZE[1] // config.MODEL.TRANS.PATCH_SIZE) + self.proj = nn.Sequential( + nn.Linear(2 * config.MODEL.TRANS.HIDDEN_SIZE, config.MODEL.TRANS.HIDDEN_SIZE), + nn.GELU() + ) + + def forward(self, x): + token = x[:, :self.token_num] + token = paddle.sum(token, axis=1).unsqueeze(1) + x = x[:, self.token_num:] + token = token.expand_as(x) + x = paddle.concat([x, token], axis=-1) + x = self.proj(x) + x = x.transpose((0, 2, 1)) + x = x.reshape((x.shape[0], x.shape[1], self.feature_size[0], self.feature_size[1])) + return x + + if config.MODEL.DPT.READOUT_PROCESS == 'drop': + return [copy.deepcopy(Drop(config)) for _ in range(4)] + if config.MODEL.DPT.READOUT_PROCESS == 'add': + return [copy.deepcopy(Add(config)) for _ in range(4)] + if config.MODEL.DPT.READOUT_PROCESS =='project': + return nn.LayerList([copy.deepcopy(Proj(config)) for _ in range(4)]) + return None + +class ResidualBLock(nn.Layer): + """Residual block + """ + def __init__(self, channels, bn=True, act=nn.ReLU): + super(ResidualBLock, self).__init__() + self.bn = bn + self.conv1 = nn.Conv2D(channels, channels, 3, 1, 1, bias_attr=not self.bn) + self.conv2 = nn.Conv2D(channels, channels, 3, 1, 1, bias_attr=not self.bn) + if bn: + self.bn1 = nn.BatchNorm2D(channels) + self.bn2 = nn.BatchNorm2D(channels) + self.act=act() + + def forward(self, inputs): + x = self.act(inputs) + x = self.conv1(x) + if self.bn: + x=self.bn1(x) + x = self.act(x) + x = self.conv2(x) + if self.bn: + x = self.bn2(x) + return inputs+x + + +class FeatureFusionBlock(nn.Layer): + """Feature fusion block + """ + def __init__(self, channels, act, bn=True, expand=True, align_corners=True, outconv=True): + super(FeatureFusionBlock, self).__init__() + self.align_corners = align_corners + self.expand = expand + out_channels = channels // 2 if expand else channels + + self.out_conv = nn.Conv2D(channels, out_channels, 1, 1, 0, bias_attr=True)if outconv else None + self.resblock1 = ResidualBLock(channels, bn, act) + self.resblock2 = ResidualBLock(channels, bn, act) + + + def forward(self, feature, x): + if x is not None: + x += self.resblock1(feature) + else: + x = feature + x = self.resblock2(x) + x = nn.functional.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True) + if self.out_conv: + return 
self.out_conv(x) + return x + + +class DPTHead(nn.Layer): + """DPTHead + + DPTHead is the decoder of the dense predition transformer (DPT), Ref https://arxiv.org/pdf/2103.13413.pdf. + Reference: + Rene Ranftl, et al. *"Vision Transformers for Dense Prediction"* + """ + def __init__(self, config): + super(DPTHead, self).__init__() + features=config.MODEL.DPT.FEATURES + self.readout_oper = readout_oper(config) + self.refine = nn.LayerList([ + copy.deepcopy(FeatureFusionBlock( + channels=features, + act=nn.ReLU, + bn=True, + expand=False, + align_corners=True + ))for _ in range(4) + ]) + self.layers_rn = get_scratch(config) + self.process = get_process(config) + self.head = nn.Sequential( + nn.Conv2D(features, features, 3, 1, 1, bias_attr=False), + nn.BatchNorm2D(features), + nn.ReLU(), + nn.Dropout2D(0.1), + nn.Conv2D(features, config.DATA.NUM_CLASSES, 1), + ) + + + def forward(self, inputs): + x = None + for i in range(3, -1, -1): + feature = self.readout_oper[i](inputs[i]) + feature = self.process[i](feature) + feature = self.layers_rn[i](feature) + x = self.refine[i](feature, x) + x = self.head(x) + x = nn.functional.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True) + return [x,] + + +def get_scratch(config, groups=1, expand=False): + """function to get the layer to make sure the features have the same dims + """ + if expand: + out_shape = [config.MODEL.DPT.FEATURES * 2 ** i for i in range(4)] + else: + out_shape = [config.MODEL.DPT.FEATURES for _ in range(4)] + layers_rn=nn.LayerList() + for i in range(4): + layers_rn.append(nn.Conv2D( + config.MODEL.DPT.HIDDEN_FEATURES[i], + out_shape[i], + kernel_size=3, + stride=1, + padding='same', + bias_attr=False, + groups=groups + )) + return layers_rn + +def get_process(config): + """ + function to get the layers to process the feature from the backbone + """ + process = nn.LayerList() + process.append( + nn.Sequential( + nn.Conv2D( + in_channels=config.MODEL.TRANS.HIDDEN_SIZE, + out_channels=config.MODEL.DPT.HIDDEN_FEATURES[0], + kernel_size=1, + stride=1, + padding=0 + ), + nn.Conv2DTranspose( + in_channels=config.MODEL.DPT.HIDDEN_FEATURES[0], + out_channels=config.MODEL.DPT.HIDDEN_FEATURES[0], + kernel_size=4, + stride=4, + padding=0, + bias_attr=True, + dilation=1, + groups=1 + ) + ) + ) + process.append( + nn.Sequential( + nn.Conv2D( + in_channels=config.MODEL.TRANS.HIDDEN_SIZE, + out_channels=config.MODEL.DPT.HIDDEN_FEATURES[1], + kernel_size=1, + stride=1, + padding=0 + ), + nn.Conv2DTranspose( + in_channels=config.MODEL.DPT.HIDDEN_FEATURES[1], + out_channels=config.MODEL.DPT.HIDDEN_FEATURES[1], + kernel_size=2, + stride=2, + padding=0, + bias_attr=True, + dilation=1, + groups=1 + ) + ) + ) + process.append( + nn.Sequential( + nn.Conv2D( + in_channels=config.MODEL.TRANS.HIDDEN_SIZE, + out_channels=config.MODEL.DPT.HIDDEN_FEATURES[2], + kernel_size=1, + stride=1, + padding=0 + ) + ) + ) + process.append( + nn.Sequential( + nn.Conv2D( + in_channels=config.MODEL.TRANS.HIDDEN_SIZE, + out_channels=config.MODEL.DPT.HIDDEN_FEATURES[3], + kernel_size=1, + stride=1, + padding=0 + ), + nn.Conv2D( + in_channels=config.MODEL.DPT.HIDDEN_FEATURES[3], + out_channels=config.MODEL.DPT.HIDDEN_FEATURES[3], + kernel_size=3, + stride=2, + padding=1, + bias_attr=True, + dilation=1, + groups=1 + ) + ) + ) + return process diff --git a/semantic_segmentation/src/models/decoders/fcn_head.py b/semantic_segmentation/src/models/decoders/fcn_head.py new file mode 100644 index 00000000..bd9cf0bc --- /dev/null +++ 
b/semantic_segmentation/src/models/decoders/fcn_head.py
@@ -0,0 +1,68 @@
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+class FCNHead(nn.Layer):
+    """FCNHead
+
+    FCNHead is the decoder of FCN, which can also be used as an auxiliary
+    segmentation head for other segmentation models.
+    Ref https://arxiv.org/pdf/1411.4038.pdf
+    Reference:
+        Jonathan Long, et al. *"Fully Convolutional Networks for Semantic Segmentation."*
+    """
+    def __init__(self,
+                 in_channels=384,
+                 channels=256,
+                 num_convs=1,
+                 concat_input=False,
+                 dropout_ratio=0.1,
+                 num_classes=60,
+                 up_ratio=16,
+                 align_corners=False):
+        super(FCNHead, self).__init__()
+        self.in_channels = in_channels
+        self.channels = channels
+        self.num_convs = num_convs
+        self.concat_input = concat_input
+        self.dropout_ratio = dropout_ratio
+        self.num_classes = num_classes
+        self.up_ratio = up_ratio
+        self.align_corners = align_corners
+        if num_convs == 0:
+            assert self.in_channels == self.channels
+        norm_bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0))
+        convs = []
+        for i in range(self.num_convs):
+            in_channels = self.in_channels if i == 0 else self.channels
+            conv = nn.Sequential(
+                nn.Conv2D(in_channels, self.channels, kernel_size=3, stride=1, padding=1, bias_attr=False),
+                nn.SyncBatchNorm(self.channels, weight_attr=self.get_norm_weight_attr(), bias_attr=norm_bias_attr),
+                nn.ReLU())
+            convs.append(conv)
+        self.convs = nn.Sequential(*convs)
+        if self.concat_input:
+            self.conv_cat = nn.Sequential(
+                nn.Conv2D(self.in_channels + self.channels, self.channels, kernel_size=3, stride=1, padding=1),
+                nn.SyncBatchNorm(self.channels, weight_attr=self.get_norm_weight_attr(), bias_attr=norm_bias_attr),
+                nn.ReLU())
+        # self.dropout must always be defined, since forward checks it against None
+        if dropout_ratio > 0:
+            self.dropout = nn.Dropout2D(p=dropout_ratio)
+        else:
+            self.dropout = None
+        self.conv_seg = nn.Conv2D(self.channels, self.num_classes, kernel_size=1)
+
+    def get_norm_weight_attr(self):
+        return paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=1.0))
+
+    def forward(self, x):
+        up_resolution = [self.up_ratio * item for item in x.shape[2:]]
+        output = self.convs(x)
+        if self.concat_input:
+            output = self.conv_cat(paddle.concat([x, output], axis=1))
+        if self.dropout is not None:
+            output = self.dropout(output)
+        output = self.conv_seg(output)
+        output = F.interpolate(output, up_resolution, mode='bilinear', align_corners=self.align_corners)
+        return output
diff --git a/semantic_segmentation/src/models/decoders/psp_head.py b/semantic_segmentation/src/models/decoders/psp_head.py
new file mode 100644
index 00000000..d12ff5ae
--- /dev/null
+++ b/semantic_segmentation/src/models/decoders/psp_head.py
@@ -0,0 +1,42 @@
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+class PyramidPoolingModule(nn.Layer):
+    """PyramidPoolingModule
+
+    PyramidPoolingModule is the pyramid pooling module of PSPNet,
+    Ref https://arxiv.org/abs/1612.01105
+
+    Reference:
+        Hengshuang Zhao, et al.
*"Pyramid Scene Parsing Network"* + """ + def __init__(self, pool_scales, in_channels, channels, align_corners=False): + super(PyramidPoolingModule, self).__init__() + self.pool_scales = pool_scales + self.in_channels = in_channels + self.channels = channels + self.align_corners = align_corners + norm_bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + + self.pool_branches = nn.LayerList() + for idx in range(len(self.pool_scales)): + self.pool_branches.append( nn.Sequential( + nn.AdaptiveAvgPool2D(self.pool_scales[idx]), + nn.Conv2D(self.in_channels, self.channels, 1, stride=1, bias_attr=False), + nn.SyncBatchNorm(self.channels, weight_attr=self.get_norm_weight_attr(), bias_attr=norm_bias_attr), + nn.ReLU())) + + def get_norm_weight_attr(self): + return paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=1.0)) + + def forward(self, x): + outs = [] + up_resolution = [item for item in x.shape[2:]] + for _, pool_layer in enumerate(self.pool_branches): + out = pool_layer(x) + up_out = F.interpolate(out, up_resolution, mode='bilinear', align_corners=self.align_corners) + outs.append(up_out) + return outs + + + diff --git a/semantic_segmentation/src/models/decoders/segformer_head.py b/semantic_segmentation/src/models/decoders/segformer_head.py new file mode 100644 index 00000000..ec2d2655 --- /dev/null +++ b/semantic_segmentation/src/models/decoders/segformer_head.py @@ -0,0 +1,83 @@ +""" +Implement The all MLP Head of Segformer +Segformer: https://arxiv.org/abs/2105.15203 + +Adapted from repository below: + https://github.com/open-mmlab/mmsegmentation +""" +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class ConvModule(nn.Layer): + def __init__(self, in_channels, out_channels, kernel_size, stride=1): + super(ConvModule, self).__init__() + + norm_bias_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(0.0)) + norm_weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=1.0)) + + self.conv = nn.Conv2D(in_channels, + out_channels, + kernel_size, + stride, + bias_attr=False) + self.bn = nn.BatchNorm(out_channels, + param_attr=norm_weight_attr, + bias_attr=norm_bias_attr) + self.act = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + + return x + + +class SegformerHead(nn.Layer): + def __init__(self, in_channels, channels, num_classes, align_corners): + super().__init__() + + self.num_classes = num_classes + self.align_corners = align_corners + self.in_channels = in_channels + self.channels = channels + + self.conv_seg = nn.Conv2D(self.channels, self.num_classes, 1, stride=1) + self.convs = nn.LayerList() + num_inputs = len(self.in_channels) + + for i in range(num_inputs): + self.convs.append( + ConvModule(self.in_channels[i], self.channels, 1, 1)) + + self.fusion_conv = ConvModule(self.channels * num_inputs, + self.channels, 1, 1) + + def get_norm_weight_attr(self): + return paddle.ParamAttr(initializer=paddle.nn.initializer.Constant( + value=1.0)) + + def forward(self, inputs): + # Receive 4 stage backbone feature map: 1/4, 1/8, 1/16, 1/32 + outs = [] + for idx in range(len(inputs)): + x = inputs[idx] + x = self.convs[idx](x) + outs.append( + F.interpolate(x, + inputs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners)) + out = self.fusion_conv(paddle.concat(outs, axis=1)) + + out = self.conv_seg(out) + up4x_resolution = [4 * item for item in inputs[0].shape[2:]] + out = F.interpolate(out, + up4x_resolution, + 
mode='bilinear', + align_corners=self.align_corners) + return [out] diff --git a/semantic_segmentation/src/models/decoders/segmentor_head.py b/semantic_segmentation/src/models/decoders/segmentor_head.py new file mode 100644 index 00000000..b7d79805 --- /dev/null +++ b/semantic_segmentation/src/models/decoders/segmentor_head.py @@ -0,0 +1,91 @@ +import copy +import paddle +import paddle.nn as nn +from src.models.backbones.vit import EncoderLayer + +class MaskTransformer(nn.Layer): + """ + Segmenter decoder use transformer as decoder for segmentation, + performs better than the linear layer. + the decoder has the same embedding dimensions as the encoder + Attributes: + layers: nn.LayerList contains multiple EncoderLayers + mask_tokens: several tokens added for segmentation, each for a certain class. + """ + def __init__(self, config): + super().__init__() + hidden_size = config.MODEL.TRANS.HIDDEN_SIZE + self.feature_size = (config.DATA.CROP_SIZE[0] // config.MODEL.TRANS.PATCH_SIZE, + config.DATA.CROP_SIZE[1] // config.MODEL.TRANS.PATCH_SIZE) + self.cls_num = config.DATA.NUM_CLASSES + + self.layers = nn.LayerList([ + copy.deepcopy(EncoderLayer(config)) for _ in range(config.MODEL.SEGMENTER.NUM_LAYERS)]) + + self.mask_tokens = self.create_parameter(shape=(1, self.cls_num, hidden_size)) + + self.proj_decoder = nn.Linear(hidden_size, hidden_size) + + weight_attr_patch = paddle.ParamAttr( + initializer=nn.initializer.Normal(std=hidden_size ** -0.5) + ) + self.proj_patch = nn.Linear( + hidden_size, + hidden_size, + weight_attr=weight_attr_patch, + bias_attr=False + ) + weight_attr_class = paddle.ParamAttr( + initializer=nn.initializer.Normal(std=hidden_size ** -0.5) + ) + self.proj_class = nn.Linear( + hidden_size, + hidden_size, + weight_attr=weight_attr_class, + bias_attr=False + ) + + self.decoder_norm = nn.LayerNorm(hidden_size) + self.mask_norm = nn.LayerNorm(self.cls_num) + + def forward(self, x): + H, W = self.feature_size + x = self.proj_decoder(x) + mask_tokens = self.mask_tokens.expand((x.shape[0], -1, -1)) + x = paddle.concat([x, mask_tokens], axis=1) + for layer in self.layers: + x, _ = layer(x) + x = self.decoder_norm(x) + patches, masks = x[:, :-self.cls_num], x[:, -self.cls_num:] + patches = self.proj_patch(patches) + masks = self.proj_class(masks) + patches = patches / paddle.norm(patches, axis=-1, keepdim=True) + masks = masks / paddle.norm(masks, axis=-1, keepdim=True) + masks = patches @ masks.transpose((0, 2, 1)) + masks = self.mask_norm(masks) + #[b, (h w), n] -> [b, n, h, w] + masks = masks.reshape((masks.shape[0], H, W, masks.shape[-1])) + masks = masks.transpose((0, 3, 1, 2)) + + return masks + +class LinearDecoder(nn.Layer): + """ + simple linear decoder with only one linear layer and the step to + resize the one-dimensional vectors to two-dimensional masks. 
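+
+    Example (illustrative sketch; assumes a config with CROP_SIZE=(512, 512),
+        PATCH_SIZE=16, HIDDEN_SIZE=768 and NUM_CLASSES=60):
+        decoder = LinearDecoder(config)
+        tokens = paddle.randn([2, (512 // 16) ** 2, 768])  # (batch, h*w, hidden_size)
+        masks = decoder(tokens)                            # (batch, 60, 32, 32)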
+ """ + def __init__(self, config): + super().__init__() + self.feature_size = (config.DATA.CROP_SIZE[0] // config.MODEL.TRANS.PATCH_SIZE, + config.DATA.CROP_SIZE[1] // config.MODEL.TRANS.PATCH_SIZE) + self.head = nn.Linear(config.MODEL.TRANS.HIDDEN_SIZE, config.DATA.NUM_CLASSES) + + def forward(self, x): + H, W = self.feature_size + + masks = self.head(x) + #[b, (h w), n] -> [b, n, h, w] + masks = masks.reshape((masks.shape[0], H, W, masks.shape[-1])) + masks = masks.transpose((0, 3, 1, 2)) + + return masks \ No newline at end of file diff --git a/semantic_segmentation/src/models/decoders/trans2seg_head.py b/semantic_segmentation/src/models/decoders/trans2seg_head.py new file mode 100644 index 00000000..229d8ed2 --- /dev/null +++ b/semantic_segmentation/src/models/decoders/trans2seg_head.py @@ -0,0 +1,115 @@ + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ..backbones import expand + + +class ConvBNReLU(nn.Layer): + '''ConvBNReLU + + Just contains Conv-BN-ReLU layer + ''' + def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, + dilation=1, groups=1, relu6=False, norm_layer=nn.BatchNorm2D): + super(ConvBNReLU, self).__init__() + self.conv = nn.Conv2D(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias_attr=False) + self.bn = norm_layer(out_channels) + self.relu = nn.ReLU6(True) if relu6 else nn.ReLU(True) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class HybridEmbed(nn.Layer): + """ CNN Feature Map Embedding + + Extract feature map from CNN, flatten, project to embedding dim. + + Attributes: + input_dim: int, input dimension, default: 2048 + embed_dim: int, embedding dimension, default: 768 + """ + def __init__(self, input_dim=2048, embed_dim=768): + super().__init__() + self.proj = nn.Linear(input_dim, embed_dim) + + def forward(self, x): + x = x.flatten(2).transpose([0, 2, 1]) + x = self.proj(x) + return x + + +class SeparableConv2d(nn.Layer): + '''Separable Conv2D + + Depthwise Separable Convolution, Ref, https://arxiv.org/pdf/1610.02357.pdf + ''' + def __init__(self, inplanes, planes, kernel_size=3, stride=1, dilation=1, relu_first=True, + bias=False, norm_layer=nn.BatchNorm2D): + super().__init__() + depthwise = nn.Conv2D(inplanes, inplanes, kernel_size, + stride=stride, padding=dilation, + dilation=dilation, groups=inplanes, bias_attr=bias) + bn_depth = norm_layer(inplanes) + pointwise = nn.Conv2D(inplanes, planes, 1, bias_attr=bias) + bn_point = norm_layer(planes) + + if relu_first: + self.block = nn.Sequential(('relu', nn.ReLU()), + ('depthwise', depthwise), + ('bn_depth', bn_depth), + ('pointwise', pointwise), + ('bn_point', bn_point) + ) + else: + self.block = nn.Sequential(('depthwise', depthwise), + ('bn_depth', bn_depth), + ('relu1', nn.ReLU()), + ('pointwise', pointwise), + ('bn_point', bn_point), + ('relu2', nn.ReLU()) + ) + + def forward(self, x): + return self.block(x) + + +class CNNHEAD(nn.Layer): + """CNNHEAD Implement + + Attributes: + vit_params: dict, input hyper params + c1_channels: int, input channels, default, 256 + hid_dim: int, hidden dimension, default, 64 + norm_layer: normalization layer type, default: nn.BatchNorm2D + """ + def __init__(self, vit_params, c1_channels=256, hid_dim=64, norm_layer=nn.BatchNorm2D): + super().__init__() + + last_channels = vit_params['EMBED_DIM'] + nhead = vit_params['NUM_HEADS'] + self.conv_c1 = ConvBNReLU(c1_channels, hid_dim, 1, norm_layer=norm_layer) + + self.lay1 = 
SeparableConv2d(last_channels+nhead, hid_dim, 3, norm_layer=norm_layer, relu_first=False) + self.lay2 = SeparableConv2d(hid_dim, hid_dim, 3, norm_layer=norm_layer, relu_first=False) + self.lay3 = SeparableConv2d(hid_dim, hid_dim, 3, norm_layer=norm_layer, relu_first=False) + + self.pred = nn.Conv2D(hid_dim, 1, 1) + + def forward(self, x, c1, nclass, B): + x = self.lay1(x) + x = self.lay2(x) + + size = c1.shape[2:] + x = F.interpolate(x, size, mode='bilinear', align_corners=True) + c1 = self.conv_c1(c1) + x = x + expand(c1, nclass) + + x = self.lay3(x) + x = self.pred(x).reshape([B, nclass, size[0], size[1]]) + + return x diff --git a/semantic_segmentation/src/models/decoders/uper_head.py b/semantic_segmentation/src/models/decoders/uper_head.py new file mode 100644 index 00000000..716864c0 --- /dev/null +++ b/semantic_segmentation/src/models/decoders/uper_head.py @@ -0,0 +1,103 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from .psp_head import PyramidPoolingModule + + +class UperHead(nn.Layer): + """UperHead + + UperHead is the decoder of UperNet, Ref https://arxiv.org/pdf/1807.10221.pdf + Reference: + Tete Xiao, et al. *"Unified Perceptual Parsing for Scene Understanding"* + """ + + def __init__(self, pool_scales, in_channels, channels, align_corners=False, num_classes=60): + super(UperHead, self).__init__() + self.pool_scales = pool_scales + self.num_classes = num_classes + self.align_corners = align_corners + self.in_channels = in_channels + self.channels = channels + norm_bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + # PSP module + self.psp_modules = PyramidPoolingModule(self.pool_scales, self.in_channels[-1], self.channels, self.align_corners) + self.bottleneck = nn.Sequential( + nn.Conv2D(self.in_channels[-1] + len(self.pool_scales)*self.channels, \ + self.channels, kernel_size=3, stride=1, padding=1, bias_attr=False), + nn.SyncBatchNorm(self.channels, weight_attr=self.get_norm_weight_attr(), bias_attr=norm_bias_attr), + nn.ReLU()) + # FPN module + self.lateral_convs = nn.LayerList() + self.fpn_convs = nn.LayerList() + for in_channel in self.in_channels[:-1]: # skip the top layer + l_conv = nn.Sequential( + nn.Conv2D(in_channel, self.channels, kernel_size=1, stride=1, bias_attr=False), + nn.SyncBatchNorm(self.channels, weight_attr=self.get_norm_weight_attr(), bias_attr=norm_bias_attr), + nn.ReLU()) + fpn_conv = nn.Sequential( + nn.Conv2D(self.channels, self.channels, kernel_size=3, stride=1, padding=1, bias_attr=False), + nn.SyncBatchNorm(self.channels, weight_attr=self.get_norm_weight_attr(), bias_attr=norm_bias_attr), + nn.ReLU()) + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + # FPN bottleneck + self.fpn_bottleneck = nn.Sequential( + nn.Conv2D(len(self.in_channels)*self.channels, self.channels, kernel_size=3,\ + stride=1, padding=1, bias_attr=False), + nn.SyncBatchNorm(self.channels, weight_attr=self.get_norm_weight_attr(), bias_attr=norm_bias_attr), + nn.ReLU()) + self.conv_seg = nn.Conv2D(self.channels, self.num_classes, 1, stride=1) + + + def get_norm_weight_attr(self): + return paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=1.0)) + + def to_2D(self, x): + n, hw, c = x.shape + h = w = int(math.sqrt(hw)) + x = x.transpose([0, 2, 1]).reshape([n, c, h, w]) + return x + + def psp_forward(self, x): + psp_outs = [x] + psp_outs.extend(self.psp_modules(x)) + psp_outs = paddle.concat(psp_outs, axis=1) + out = self.bottleneck(psp_outs) + return out + + def forward(self, inputs): + 
up4x_resolution = [ 4*item for item in inputs[0].shape[2:]] + # build laterals + laterals = [ + lateral_conv(inputs[i]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + laterals.append(self.psp_forward(inputs[-1])) + # build top-down fusion + used_feat_levels = len(laterals) + for idx in range(used_feat_levels-1, 0, -1): + prev_size = laterals[idx-1].shape[2:] + laterals[idx-1] += F.interpolate(laterals[idx], prev_size, mode='bilinear', align_corners=self.align_corners) + # build fpn-output + fpn_outs = [ + self.fpn_convs[idx](laterals[idx]) + for idx in range(used_feat_levels - 1) + ] + # add features from psp module + fpn_outs.append(laterals[-1]) + + # upsample feats from all level to the same size + for idx in range(used_feat_levels -1, 0, -1): + fpn_outs[idx] = F.interpolate( + fpn_outs[idx], + fpn_outs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners) + fpn_outs = paddle.concat(fpn_outs, axis=1) + output = self.fpn_bottleneck(fpn_outs) + output = self.conv_seg(output) + output = F.interpolate(output, up4x_resolution, mode='bilinear', align_corners=self.align_corners) + return output + + diff --git a/semantic_segmentation/src/models/decoders/vit_mla_auxi_head.py b/semantic_segmentation/src/models/decoders/vit_mla_auxi_head.py new file mode 100644 index 00000000..bffaf9e8 --- /dev/null +++ b/semantic_segmentation/src/models/decoders/vit_mla_auxi_head.py @@ -0,0 +1,39 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class VIT_MLA_AUXIHead(nn.Layer): + """VIT_MLA_AUXIHead + + VIT_MLA_AUXIHead is the auxiliary segmentation decoder of SETR-MLA + Reference: + Sixiao Zheng, et al. *"Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformers"* + """ + + def __init__(self, in_channels = 256, num_classes = 60, align_corners= False): + super(VIT_MLA_AUXIHead, self).__init__() + self.in_channels = in_channels + self.num_classes = num_classes + self.align_corners = align_corners + if self.in_channels == 1024: + self.aux_0 = nn.Conv2D(self.in_channels, 256, + kernel_size=1, bias_attr=False) + self.aux_1 = nn.Conv2D(256, self.num_classes, + kernel_size=1, bias_attr=False) + elif self.in_channels == 256: + self.aux = nn.Conv2D(self.in_channels, self.num_classes, + kernel_size=1, bias_attr=False) + + + def forward(self, x): + up16x_resolution = [ 16*item for item in x.shape[2:]] + if self.in_channels == 1024: + x = self.aux_0(x) + aux_pred = self.aux_1(x) + elif self.in_channels == 256: + aux_pred = self.aux(x) + aux_pred_full = F.interpolate( + aux_pred, up16x_resolution, mode='bilinear', + align_corners=self.align_corners) + return aux_pred_full diff --git a/semantic_segmentation/src/models/decoders/vit_mla_head.py b/semantic_segmentation/src/models/decoders/vit_mla_head.py new file mode 100644 index 00000000..a9750d6d --- /dev/null +++ b/semantic_segmentation/src/models/decoders/vit_mla_head.py @@ -0,0 +1,140 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class VIT_MLAHead(nn.Layer): + """VIT_MLAHead + + VIT_MLAHead is the decoder of SETR-MLA, Ref https://arxiv.org/pdf/2012.15840.pdf. 
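+
+    Example (illustrative sketch with the default channel settings):
+        head = VIT_MLAHead(mla_channels=256, mlahead_channels=128, num_classes=60)
+        p2 = p3 = p4 = p5 = paddle.randn([2, 256, 32, 32])  # MLA features at 1/16 of a 512x512 input
+        pred = head(p2, p3, p4, p5)                         # (2, 60, 512, 512)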
+ """ + + def __init__(self, mla_channels=256, mlahead_channels=128, + num_classes=60, align_corners=False): + super(VIT_MLAHead, self).__init__() + self.num_classes = num_classes + self.align_corners = align_corners + sync_norm_bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0.0)) + self.head2 = nn.Sequential( + nn.Conv2D( + mla_channels, + mlahead_channels, + 3, + padding=1, + bias_attr=False), + nn.SyncBatchNorm( + mlahead_channels, + weight_attr=self.get_sync_norm_weight_attr(), + bias_attr=sync_norm_bias_attr), + nn.ReLU(), + nn.Conv2D( + mlahead_channels, + mlahead_channels, + 3, + padding=1, + bias_attr=False), + nn.SyncBatchNorm( + mlahead_channels, + weight_attr=self.get_sync_norm_weight_attr(), + bias_attr=sync_norm_bias_attr), + nn.ReLU()) + self.head3 = nn.Sequential( + nn.Conv2D( + mla_channels, + mlahead_channels, + 3, + padding=1, + bias_attr=False), + nn.SyncBatchNorm( + mlahead_channels, + weight_attr=self.get_sync_norm_weight_attr(), + bias_attr=sync_norm_bias_attr), + nn.ReLU(), + nn.Conv2D( + mlahead_channels, + mlahead_channels, + 3, + padding=1, + bias_attr=False), + nn.SyncBatchNorm( + mlahead_channels, + weight_attr=self.get_sync_norm_weight_attr(), + bias_attr=sync_norm_bias_attr), + nn.ReLU()) + self.head4 = nn.Sequential( + nn.Conv2D( + mla_channels, + mlahead_channels, + 3, + padding=1, + bias_attr=False), + nn.SyncBatchNorm( + mlahead_channels, + weight_attr=self.get_sync_norm_weight_attr(), + bias_attr=sync_norm_bias_attr), + nn.ReLU(), + nn.Conv2D( + mlahead_channels, + mlahead_channels, + 3, + padding=1, + bias_attr=False), + nn.SyncBatchNorm( + mlahead_channels, + weight_attr=self.get_sync_norm_weight_attr(), + bias_attr=sync_norm_bias_attr), + nn.ReLU()) + self.head5 = nn.Sequential( + nn.Conv2D( + mla_channels, + mlahead_channels, + 3, + padding=1, + bias_attr=False), + nn.SyncBatchNorm( + mlahead_channels, + weight_attr=self.get_sync_norm_weight_attr(), + bias_attr=sync_norm_bias_attr), + nn.ReLU(), + nn.Conv2D( + mlahead_channels, + mlahead_channels, + 3, + padding=1, + bias_attr=False), + nn.SyncBatchNorm( + mlahead_channels, + weight_attr=self.get_sync_norm_weight_attr(), + bias_attr=sync_norm_bias_attr), + nn.ReLU()) + self.cls = nn.Conv2D(4*mlahead_channels, self.num_classes, 3, padding=1) + + def get_sync_norm_weight_attr(self): + return paddle.ParamAttr( + initializer=nn.initializer.Uniform(low=0.0, high=1.0, name=None)) + + def forward(self, mla_p2, mla_p3, mla_p4, mla_p5): + up4x_resolution = [4*item for item in mla_p2.shape[2:]] + up16x_resolution = [16*item for item in mla_p2.shape[2:]] + # head2: 2 Conv layers + 4x_upsmaple + h2_out = self.head2(mla_p2) + h2_out_x4 = F.interpolate(h2_out, up4x_resolution, + mode='bilinear', align_corners=True) + h3_out = self.head3(mla_p3) + h3_out_x4 = F.interpolate(h3_out, up4x_resolution, + mode='bilinear', align_corners=True) + h4_out = self.head4(mla_p4) + h4_out_x4 = F.interpolate(h4_out, up4x_resolution, + mode='bilinear', align_corners=True) + h5_out = self.head5(mla_p5) + h5_out_x4 = F.interpolate(h5_out, up4x_resolution, + mode='bilinear', align_corners=True) + # concatenating multi-head + hout_concat = paddle.concat([h2_out_x4, h3_out_x4, + h4_out_x4, h5_out_x4], axis=1) + # pixel-level cls. 
+ pred = self.cls(hout_concat) # (B, num_classes, H/4, W/4) + pred_full = F.interpolate( + pred, up16x_resolution, mode='bilinear', + align_corners=self.align_corners) + return pred_full diff --git a/semantic_segmentation/src/models/decoders/vit_up_head.py b/semantic_segmentation/src/models/decoders/vit_up_head.py new file mode 100644 index 00000000..c8fe077a --- /dev/null +++ b/semantic_segmentation/src/models/decoders/vit_up_head.py @@ -0,0 +1,106 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +import copy +import numpy as np +import math + + +class VisionTransformerUpHead(nn.Layer): + """VisionTransformerUpHead + + VisionTransformerUpHead is the decoder of SETR-PUP, Ref https://arxiv.org/pdf/2012.15840.pdf + + Reference: + Sixiao Zheng, et al. *"Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformers"* + """ + + def __init__(self, embed_dim=1024, num_conv=1, num_upsample_layer=1, + conv3x3_conv1x1=True, align_corners=False, num_classes=60): + super(VisionTransformerUpHead, self).__init__() + self.num_classes = num_classes + self.align_corners = align_corners + self.num_conv = num_conv + self.num_upsample_layer = num_upsample_layer + self.conv3x3_conv1x1 = conv3x3_conv1x1 + + norm_bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + self.norm = nn.LayerNorm(embed_dim, epsilon=1e-06, weight_attr=self.get_norm_weight_attr(), bias_attr=norm_bias_attr) + + if self.num_conv == 2: + if self.conv3x3_conv1x1: + self.conv_0 = nn.Conv2D(embed_dim, 256, 3, stride=1, padding=1, bias_attr=True) + else: + self.conv_0 = nn.Conv2D(embed_dim, 256, 1, stride=1, bias_attr=True) + self.conv_1 = nn.Conv2D(256, self.num_classes, 1, stride=1) + self.syncbn_fc_0 = nn.SyncBatchNorm(256, weight_attr=self.get_norm_weight_attr(), bias_attr=norm_bias_attr) + + elif self.num_conv == 4: + self.conv_0 = nn.Conv2D(embed_dim, 256, kernel_size=3, stride=1, padding=1) + self.conv_1 = nn.Conv2D(256, 256, kernel_size=3, stride=1, padding=1) + self.conv_2 = nn.Conv2D(256, 256, kernel_size=3, stride=1, padding=1) + self.conv_3 = nn.Conv2D(256, 256, kernel_size=3, stride=1, padding=1) + self.conv_4 = nn.Conv2D(256, self.num_classes, kernel_size=1, stride=1) + self.syncbn_fc_0 = nn.SyncBatchNorm(256, weight_attr=self.get_norm_weight_attr(), bias_attr=norm_bias_attr) + self.syncbn_fc_1 = nn.SyncBatchNorm(256, weight_attr=self.get_norm_weight_attr(), bias_attr=norm_bias_attr) + self.syncbn_fc_2 = nn.SyncBatchNorm(256, weight_attr=self.get_norm_weight_attr(), bias_attr=norm_bias_attr) + self.syncbn_fc_3 = nn.SyncBatchNorm(256, weight_attr=self.get_norm_weight_attr(), bias_attr=norm_bias_attr) + + + + def get_norm_weight_attr(self): + return paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=1.0)) + + def to_2D(self, x): + n, hw, c = x.shape + h = w = int(math.sqrt(hw)) + x = x.transpose([0, 2, 1]).reshape([n, c, h, w]) + return x + + def forward(self, x): + x = self.norm(x) + # (b,hw,c) -> (b,c,h,w) + x = self.to_2D(x) + up4x_resolution = [ 4*item for item in x.shape[2:]] + up16x_resolution = [ 16*item for item in x.shape[2:]] + if self.num_conv == 2: + if self.num_upsample_layer == 2: + x = self.conv_0(x) + x = self.syncbn_fc_0(x) + x = F.relu(x) + x = F.interpolate(x, up4x_resolution, mode='bilinear', align_corners=self.align_corners) + x = self.conv_1(x) + x = F.interpolate(x, up16x_resolution, mode='bilinear', align_corners=self.align_corners) + elif self.num_upsample_layer == 1: + x = self.conv_0(x) + x = 
self.syncbn_fc_0(x) + x = F.relu(x) + x = self.conv_1(x) + x = F.interpolate(x, up16x_resolution, mode='bilinear', align_corners=self.align_corners) + elif self.num_conv == 4: + if self.num_upsample_layer == 4: + x = self.conv_0(x) + x = self.syncbn_fc_0(x) + x = F.relu(x) + up2x_resolution = [ 2*item for item in x.shape[2:]] + x = F.interpolate(x, up2x_resolution, mode='bilinear', align_corners=self.align_corners) + x = self.conv_1(x) + x = self.syncbn_fc_1(x) + x = F.relu(x) + up2x_resolution = [ 2*item for item in x.shape[2:]] + x = F.interpolate(x, up2x_resolution, mode='bilinear', align_corners=self.align_corners) + x = self.conv_2(x) + x = self.syncbn_fc_2(x) + x = F.relu(x) + up2x_resolution = [ 2*item for item in x.shape[2:]] + x = F.interpolate(x, up2x_resolution, mode='bilinear', align_corners=self.align_corners) + x = self.conv_3(x) + x = self.syncbn_fc_3(x) + x = F.relu(x) + x = self.conv_4(x) + up2x_resolution = [ 2*item for item in x.shape[2:]] + x = F.interpolate(x, up2x_resolution, mode='bilinear', align_corners=self.align_corners) + return x + + diff --git a/semantic_segmentation/src/models/dpt.py b/semantic_segmentation/src/models/dpt.py new file mode 100644 index 00000000..6f5dc7bb --- /dev/null +++ b/semantic_segmentation/src/models/dpt.py @@ -0,0 +1,27 @@ +""" +This module implements DPT +Vision Transformers for Dense Prediction + +""" + +import paddle +import paddle.nn as nn +from .backbones.vit import VisualTransformer +from .decoders.dpt_head import DPTHead + +class DPTSeg(nn.Layer): + """DPT Segmentation model + """ + def __init__(self, config): + super(DPTSeg, self).__init__() + self.backbone = VisualTransformer(config) + self.head = DPTHead(config) + + def forward(self, inputs): + features = self.backbone(inputs) + out = self.head(features) + return out + + def init__decoder_lr_coef(self, coef): + for param in self.head.parameters(): + param.optimize_attr['learning_rate'] = coef \ No newline at end of file diff --git a/semantic_segmentation/src/models/losses/__init__.py b/semantic_segmentation/src/models/losses/__init__.py new file mode 100644 index 00000000..3c4d7943 --- /dev/null +++ b/semantic_segmentation/src/models/losses/__init__.py @@ -0,0 +1 @@ +from .cross_entropy_loss import CrossEntropyLoss diff --git a/semantic_segmentation/src/models/losses/cross_entropy_loss.py b/semantic_segmentation/src/models/losses/cross_entropy_loss.py new file mode 100644 index 00000000..c90c069a --- /dev/null +++ b/semantic_segmentation/src/models/losses/cross_entropy_loss.py @@ -0,0 +1,59 @@ +import paddle +from paddle import nn +import paddle.nn.functional as F + + +class CrossEntropyLoss(nn.Layer): + """ + Implements the cross entropy loss function. + + Args: + weight (ndarray, optional): A manual rescaling weight given to each + class. Its length must be equal to the number of classes. Default ``None``. + ignore_index (int64, optional): The ignored class. + """ + + def __init__(self, weight=None, ignore_index=255): + super(CrossEntropyLoss, self).__init__() + if weight is not None: + weight = paddle.to_tensor(weight, dtype='float32') + self.weight = weight + self.ignore_index = ignore_index + self.eps = 1e-8 + + def forward(self, logit, label, semantic_weights=None): + """ + Forward computation. + + Args: + logit (Tensor): Logit tensor. shape: (B,C,H,W) + label (Tensor): Label tensor, the data type is int64. 
shape: (B,H,W) + """ + if self.weight is not None and logit.shape[1] != len(self.weight): + raise ValueError( + 'The number of weights = {} must be the same as the number of classes = {}.' + .format(len(self.weight), logit.shape[1])) + + logit = paddle.transpose(logit, [0, 2, 3, 1]) + if self.weight is None: + loss = F.cross_entropy( + logit, label, ignore_index=self.ignore_index, reduction='none') + else: + label_one_hot = F.one_hot(label, logit.shape[-1]) + loss = F.cross_entropy( + logit, + label_one_hot * self.weight, + soft_label=True, + ignore_index=self.ignore_index, + reduction='none') + loss = loss.squeeze(-1) + + mask = label != self.ignore_index + mask = paddle.cast(mask, 'float32') + loss = loss * mask + if semantic_weights is not None: + loss = loss * semantic_weights + label.stop_gradient = True + mask.stop_gradient = True + avg_loss = paddle.mean(loss) / (paddle.mean(mask) + self.eps) + return avg_loss diff --git a/semantic_segmentation/src/models/losses/ohem_cross_entropy_loss.py b/semantic_segmentation/src/models/losses/ohem_cross_entropy_loss.py new file mode 100644 index 00000000..ce14348e --- /dev/null +++ b/semantic_segmentation/src/models/losses/ohem_cross_entropy_loss.py @@ -0,0 +1,83 @@ +import paddle +from paddle import nn +import paddle.nn.functional as F + + + +class OhemCrossEntropyLoss(nn.Layer): + """ + Implements the ohem cross entropy loss function. + + Args: + thresh (float, optional): The threshold of ohem. Default: 0.7. + min_kept (int, optional): The min number to keep in loss computation. Default: 10000. + ignore_index (int64, optional): Specifies a target value that is ignored + and does not contribute to the input gradient. Default ``255``. + """ + + def __init__(self, thresh=0.7, min_kept=10000, ignore_index=255): + super(OhemCrossEntropyLoss, self).__init__() + self.thresh = thresh + self.min_kept = min_kept + self.ignore_index = ignore_index + self.EPS = 1e-5 + + def forward(self, logit, label): + """ + Forward computation. + + Args: + logit (Tensor): Logit tensor, the data type is float32, float64. Shape is + (N, C), where C is number of classes, and if shape is more than 2D, this + is (N, C, D1, D2,..., Dk), k >= 1. + label (Tensor): Label tensor, the data type is int64. Shape is (N), where each + value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is + (N, D1, D2,..., Dk), k >= 1. 
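+
+        Example (illustrative sketch):
+            criterion = OhemCrossEntropyLoss(thresh=0.7, min_kept=10000)
+            logit = paddle.randn([2, 19, 128, 128])       # (N, C, H, W)
+            label = paddle.randint(0, 19, [2, 128, 128])  # (N, H, W), int64
+            loss = criterion(logit, label)                # scalar average loss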
+ """ + if len(label.shape) != len(logit.shape): + label = paddle.unsqueeze(label, 1) + + # get the label after ohem + n, c, h, w = logit.shape + label = label.reshape((-1, )) + valid_mask = (label != self.ignore_index).astype('int64') + num_valid = valid_mask.sum() + label = label * valid_mask + + prob = F.softmax(logit, axis=1) + prob = prob.transpose((1, 0, 2, 3)).reshape((c, -1)) + + if self.min_kept < num_valid and num_valid > 0: + # let the value which ignored greater than 1 + prob = prob + (1 - valid_mask) + + # get the prob of relevant label + label_onehot = F.one_hot(label, c) + label_onehot = label_onehot.transpose((1, 0)) + prob = prob * label_onehot + prob = paddle.sum(prob, axis=0) + + threshold = self.thresh + if self.min_kept > 0: + index = prob.argsort() + threshold_index = index[min(len(index), self.min_kept) - 1] + threshold_index = int(threshold_index.numpy()[0]) + if prob[threshold_index] > self.thresh: + threshold = prob[threshold_index] + kept_mask = (prob < threshold).astype('int64') + label = label * kept_mask + valid_mask = valid_mask * kept_mask + + # make the invalid region as ignore + label = label + (1 - valid_mask) * self.ignore_index + + label = label.reshape((n, 1, h, w)) + valid_mask = valid_mask.reshape((n, 1, h, w)).astype('float32') + loss = F.softmax_with_cross_entropy( + logit, label, ignore_index=self.ignore_index, axis=1) + loss = loss * valid_mask + avg_loss = paddle.mean(loss) / (paddle.mean(valid_mask) + self.EPS) + + label.stop_gradient = True + valid_mask.stop_gradient = True + return avg_loss diff --git a/semantic_segmentation/src/models/segformer.py b/semantic_segmentation/src/models/segformer.py new file mode 100644 index 00000000..a3dfba0e --- /dev/null +++ b/semantic_segmentation/src/models/segformer.py @@ -0,0 +1,38 @@ +import paddle.nn as nn + +from .backbones.mix_transformer import MixVisionTransformer +from .decoders.segformer_head import SegformerHead + + +class Segformer(nn.Layer): + """Segformer model implementation + + """ + def __init__(self, config): + super(Segformer, self).__init__() + self.backbone = MixVisionTransformer( + in_channels=config.MODEL.TRANS.IN_CHANNELS, + embed_dims=config.MODEL.TRANS.EMBED_DIM, + num_stages=config.MODEL.TRANS.NUM_STAGES, + num_layers=config.MODEL.TRANS.NUM_LAYERS, + num_heads=config.MODEL.TRANS.NUM_HEADS, + patch_sizes=config.MODEL.TRANS.PATCH_SIZE, + strides=config.MODEL.TRANS.STRIDES, + sr_ratios=config.MODEL.TRANS.SR_RATIOS, + out_indices=config.MODEL.ENCODER.OUT_INDICES, + mlp_ratio=config.MODEL.TRANS.MLP_RATIO, + qkv_bias=config.MODEL.TRANS.QKV_BIAS, + drop_rate=config.MODEL.DROPOUT, + attn_drop_rate=config.MODEL.ATTENTION_DROPOUT, + drop_path_rate=config.MODEL.DROP_PATH, + pretrained=config.MODEL.PRETRAINED) + self.decode_head = SegformerHead( + in_channels=config.MODEL.SEGFORMER.IN_CHANNELS, + channels=config.MODEL.SEGFORMER.CHANNELS, + num_classes=config.DATA.NUM_CLASSES, + align_corners=config.MODEL.SEGFORMER.ALIGN_CORNERS) + + def forward(self, inputs): + features = self.backbone(inputs) + out = self.decode_head(features) + return out \ No newline at end of file diff --git a/semantic_segmentation/src/models/segmentor.py b/semantic_segmentation/src/models/segmentor.py new file mode 100644 index 00000000..a592e7e3 --- /dev/null +++ b/semantic_segmentation/src/models/segmentor.py @@ -0,0 +1,45 @@ +import paddle +from paddle.fluid.layers.nn import size +import paddle.nn as nn +from src.models.backbones import VisualTransformer +from src.models.backbones import Deit +from src.models.decoders 
import MaskTransformer
+from src.models.decoders import LinearDecoder
+
+class Segmentor(nn.Layer):
+    """
+    Segmenter model implementation
+    """
+    def __init__(self, config):
+        super().__init__()
+        self.img_size = config.DATA.CROP_SIZE
+        if 'ViT' in config.MODEL.ENCODER.TYPE:
+            self.encoder = VisualTransformer(config)
+        elif 'DeiT' in config.MODEL.ENCODER.TYPE:
+            self.encoder = Deit(config)
+        if 'MaskTransformer' in config.MODEL.DECODER_TYPE:
+            self.decoder = MaskTransformer(config)
+        elif 'Linear' in config.MODEL.DECODER_TYPE:
+            self.decoder = LinearDecoder(config)
+        self.norm = nn.LayerNorm(config.MODEL.TRANS.HIDDEN_SIZE)
+        self.token_num = 2 if 'DeiT' in config.MODEL.ENCODER.TYPE else 1
+        self.init__decoder_lr_coef(config.TRAIN.DECODER_LR_COEF)
+
+    def init__decoder_lr_coef(self, coef):
+        for param in self.decoder.parameters():
+            param.optimize_attr['learning_rate'] = coef
+
+    def forward(self, x):
+        x = self.encoder(x)
+        x = x[-1]
+        x = self.norm(x)
+        x = x[:, self.token_num:]
+        masks = self.decoder(x)
+
+        masks = nn.functional.interpolate(
+            masks,
+            size=self.img_size,
+            mode="bilinear"
+        )
+
+        return [masks]
\ No newline at end of file
diff --git a/semantic_segmentation/src/models/setr.py b/semantic_segmentation/src/models/setr.py
new file mode 100644
index 00000000..9ee31277
--- /dev/null
+++ b/semantic_segmentation/src/models/setr.py
@@ -0,0 +1,152 @@
+"""
+This module implements SETR
+Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformers
+
+"""
+
+import paddle
+import paddle.nn as nn
+from src.models.backbones import ViT_MLA, VisualTransformer
+from src.models.decoders import VIT_MLAHead, VIT_MLA_AUXIHead, VisionTransformerUpHead
+from src.utils import load_pretrained_model
+
+
+class SETR(nn.Layer):
+    """ SETR
+
+    SEgmentation TRansformer (SETR) has three different decoder designs to
+    perform pixel-level segmentation. The variants of SETR include SETR_MLA,
+    SETR_PUP, and SETR_Naive.
+
+    Attributes:
+        encoder: A backbone network for extracting features from the image.
+        auxi_head: A boolean indicating whether the auxiliary segmentation head is employed.
+        decoder_type: Type of decoder.
+        decoder: A decoder module for semantic segmentation.
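+
+    Example (illustrative sketch; assumes a config whose MODEL.ENCODER.TYPE is
+        "ViT_MLA" and MODEL.DECODER_TYPE is "VIT_MLAHead"):
+        model = SETR(config)
+        imgs = paddle.randn([2, 3, 512, 512])
+        preds = model(imgs)  # preds[0] is the main prediction; auxiliary outputs follow when MODEL.AUX.AUXIHEAD is True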
+ """ + def __init__(self, config): + super(SETR, self).__init__() + if config.MODEL.ENCODER.TYPE == "ViT_MLA": + self.encoder = ViT_MLA(config) + elif config.MODEL.ENCODER.TYPE == "ViT": + self.encoder = VisualTransformer(config) + self.auxi_head = config.MODEL.AUX.AUXIHEAD + self.decoder_type = config.MODEL.DECODER_TYPE + + if self.decoder_type == "VIT_MLAHead": + self.decoder = VIT_MLAHead( + config.MODEL.MLA.MLA_CHANNELS, + config.MODEL.MLA.MLAHEAD_CHANNELS, + config.DATA.NUM_CLASSES, + config.MODEL.MLA.MLAHEAD_ALIGN_CORNERS) + self.auxi_head = config.MODEL.AUX.AUXIHEAD + if self.auxi_head == True: + self.aux_decoder2 = VIT_MLA_AUXIHead( + config.MODEL.MLA.MLA_CHANNELS, + config.DATA.NUM_CLASSES, + config.MODEL.AUX.AUXHEAD_ALIGN_CORNERS) + self.aux_decoder3 = VIT_MLA_AUXIHead( + config.MODEL.MLA.MLA_CHANNELS, + config.DATA.NUM_CLASSES, + config.MODEL.AUX.AUXHEAD_ALIGN_CORNERS) + self.aux_decoder4 = VIT_MLA_AUXIHead( + config.MODEL.MLA.MLA_CHANNELS, + config.DATA.NUM_CLASSES, + config.MODEL.AUX.AUXHEAD_ALIGN_CORNERS) + self.aux_decoder5 = VIT_MLA_AUXIHead( + config.MODEL.MLA.MLA_CHANNELS, + config.DATA.NUM_CLASSES, + config.MODEL.AUX.AUXHEAD_ALIGN_CORNERS) + + elif (self.decoder_type == "PUP_VisionTransformerUpHead" or + self.decoder_type == "Naive_VisionTransformerUpHead"): + self.decoder = VisionTransformerUpHead( + config.MODEL.PUP.INPUT_CHANNEL, + config.MODEL.PUP.NUM_CONV, + config.MODEL.PUP.NUM_UPSAMPLE_LAYER, + config.MODEL.PUP.CONV3x3_CONV1x1, + config.MODEL.PUP.ALIGN_CORNERS, + config.DATA.NUM_CLASSES) + if self.auxi_head == True: + self.aux_decoder2 = VisionTransformerUpHead( + config.MODEL.AUXPUP.INPUT_CHANNEL, + config.MODEL.AUXPUP.NUM_CONV, + config.MODEL.AUXPUP.NUM_UPSAMPLE_LAYER, + config.MODEL.AUXPUP.CONV3x3_CONV1x1, + config.MODEL.AUXPUP.ALIGN_CORNERS, + config.DATA.NUM_CLASSES) + self.aux_decoder3 = VisionTransformerUpHead( + config.MODEL.AUXPUP.INPUT_CHANNEL, + config.MODEL.AUXPUP.NUM_CONV, + config.MODEL.AUXPUP.NUM_UPSAMPLE_LAYER, + config.MODEL.AUXPUP.CONV3x3_CONV1x1, + config.MODEL.AUXPUP.ALIGN_CORNERS, + config.DATA.NUM_CLASSES) + self.aux_decoder4 = VisionTransformerUpHead( + config.MODEL.AUXPUP.INPUT_CHANNEL, + config.MODEL.AUXPUP.NUM_CONV, + config.MODEL.AUXPUP.NUM_UPSAMPLE_LAYER, + config.MODEL.AUXPUP.CONV3x3_CONV1x1, + config.MODEL.AUXPUP.ALIGN_CORNERS, + config.DATA.NUM_CLASSES) + if self.decoder_type == "PUP_VisionTransformerUpHead": + self.aux_decoder5 = VisionTransformerUpHead( + config.MODEL.AUXPUP.INPUT_CHANNEL, + config.MODEL.AUXPUP.NUM_CONV, + config.MODEL.AUXPUP.NUM_UPSAMPLE_LAYER, + config.MODEL.AUXPUP.CONV3x3_CONV1x1, + config.MODEL.AUXPUP.ALIGN_CORNERS, + config.DATA.NUM_CLASSES) + self.init__decoder_lr_coef(config) + + def init__decoder_lr_coef(self, config): + #print("self.decoder.sublayers(): ", self.decoder.sublayers()) + for sublayer in self.decoder.sublayers(): + #print("F sublayer: ", sublayer) + if isinstance(sublayer, nn.Conv2D): + #print("sublayer: ", sublayer) + sublayer.weight.optimize_attr['learning_rate'] = config.TRAIN.DECODER_LR_COEF + if sublayer.bias is not None: + sublayer.bias.optimize_attr['learning_rate'] = config.TRAIN.DECODER_LR_COEF + if (isinstance(sublayer, nn.SyncBatchNorm) or + isinstance(sublayer, nn.BatchNorm2D) or + isinstance(sublayer,nn.LayerNorm)): + #print("SyncBN, BatchNorm2D, or LayerNorm") + #print("sublayer: ", sublayer) + sublayer.weight.optimize_attr['learning_rate'] = config.TRAIN.DECODER_LR_COEF + sublayer.bias.optimize_attr['learning_rate'] = config.TRAIN.DECODER_LR_COEF + if self.auxi_head == True: + 
sublayers = [] # list of list + sublayers.append(self.aux_decoder2.sublayers()) + sublayers.append(self.aux_decoder3.sublayers()) + sublayers.append(self.aux_decoder4.sublayers()) + if self.decoder_type == "PUP_VisionTransformerUpHead": + sublayers.append(self.aux_decoder5.sublayers()) + #print("self.aux_decoders.sublayers(): ", sublayers) + for sublayer_list in sublayers: + for sublayer in sublayer_list: + if isinstance(sublayer, nn.Conv2D): + #print("sublayer: ", sublayer) + sublayer.weight.optimize_attr['learning_rate'] = config.TRAIN.DECODER_LR_COEF + if sublayer.bias is not None: + sublayer.bias.optimize_attr['learning_rate'] = config.TRAIN.DECODER_LR_COEF + + + def forward(self, imgs): + # imgs.shapes: (B,3,H,W) + p2, p3, p4, p5 = self.encoder(imgs) + preds = [] + if self.decoder_type == "VIT_MLAHead": + pred = self.decoder(p2, p3, p4, p5) + elif (self.decoder_type == "PUP_VisionTransformerUpHead" or + self.decoder_type == "Naive_VisionTransformerUpHead"): + pred = self.decoder(p5) + preds.append(pred) + if self.auxi_head == True: + preds.append(self.aux_decoder2(p2)) + preds.append(self.aux_decoder3(p3)) + preds.append(self.aux_decoder4(p4)) + if self.decoder_type == "PUP_VisionTransformerUpHead": + preds.append(self.aux_decoder5(p5)) + return preds + diff --git a/semantic_segmentation/src/models/trans2seg.py b/semantic_segmentation/src/models/trans2seg.py new file mode 100644 index 00000000..0cfeb6f4 --- /dev/null +++ b/semantic_segmentation/src/models/trans2seg.py @@ -0,0 +1,86 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from src.models.decoders.fcn_head import FCNHead +from src.models.decoders import ConvBNReLU, SeparableConv2d, CNNHEAD, HybridEmbed +from .backbones import get_segmentation_backbone, TransformerEncoder, TransformerDecoder, expand + + +class Trans2Seg(nn.Layer): + """Trans2Seg Implement + + It contains cnn-encoder, transformer-encoder and transformer-decoder, and a small-cnn-head + Ref, https://arxiv.org/pdf/2101.08461.pdf + + """ + def __init__(self, config): + super(Trans2Seg, self).__init__() + c1_channels = 256 + c4_channels = 2048 + self.nclass = config.DATA.NUM_CLASSES + self.aux = config.TRAIN.LR_SCHEDULER.AUX + self.backbone = config.MODEL.ENCODER.TYPE.lower() + + # Create cnn encoder, the input image is fed to CNN to extract features + self.cnn_encoder = get_segmentation_backbone(self.backbone, config, nn.BatchNorm2D) + + # Get vit hyper params + vit_params = config.MODEL.TRANS2SEG + hid_dim = config.MODEL.TRANS2SEG.HID_DIM + + c4_HxW = (config.DATA.CROP_SIZE[0] // 16) ** 2 + vit_params['decoder_feat_HxW'] = c4_HxW + + last_channels = vit_params['EMBED_DIM'] + + # create transformer encoder, for transformer encoder, + # the features and position embedding are flatten and fed to transformer for self-attention, + # and output feature(Fe) from transformer encoder. 
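+        # Illustrative shape flow (assuming a 512x512 crop and a c4 feature map with
+        # 2048 channels at 1/16 resolution): c4 is (B, 2048, 32, 32) and HybridEmbed
+        # flattens and projects it to (B, 1024, EMBED_DIM) before self-attention.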
+        self.transformer_encoder = TransformerEncoder(
+            embed_dim=last_channels,
+            depth=vit_params['DEPTH'],
+            num_heads=vit_params['NUM_HEADS'],
+            mlp_ratio=vit_params['MLP_RATIO'])
+        # create transformer decoder; for the transformer decoder, we specifically define
+        # a set of learnable class prototype embeddings as query and take the features
+        # from the transformer encoder as key
+        self.transformer_decoder = TransformerDecoder(
+            embed_dim=last_channels,
+            depth=vit_params['DEPTH'],
+            num_heads=vit_params['NUM_HEADS'],
+            mlp_ratio=vit_params['MLP_RATIO'],
+            decoder_feat_HxW=vit_params['decoder_feat_HxW'])
+        # Create Hybrid Embedding
+        self.hybrid_embed = HybridEmbed(c4_channels, last_channels)
+        # Create a small conv head to fuse the attention maps with the Res2 feature from the CNN backbone
+        self.cnn_head = CNNHEAD(vit_params, c1_channels=c1_channels, hid_dim=hid_dim)
+
+        if self.aux:
+            self.auxlayer = FCNHead(in_channels=728, channels=728 // 4, num_classes=self.nclass)
+
+    def forward(self, x):
+        size = x.shape[2:]
+        c1, c2, c3, c4 = self.cnn_encoder(x)
+        outputs = list()
+        n, _, h, w = c4.shape
+        c4 = self.hybrid_embed(c4)
+        cls_token, c4 = self.transformer_encoder.forward_encoder(c4)
+        attns_list = self.transformer_decoder.forward_decoder(c4)
+        feat_enc = c4.reshape([n, h, w, -1]).transpose([0, 3, 1, 2])
+
+        attn_map = attns_list[-1]
+        B, nclass, nhead, _ = attn_map.shape
+        _, _, H, W = feat_enc.shape
+        attn_map = attn_map.reshape([B*nclass, nhead, H, W])
+        x = paddle.concat([expand(feat_enc, nclass), attn_map], 1)
+        x = self.cnn_head(x, c1, nclass, B)
+
+        x = F.interpolate(x, size, mode='bilinear', align_corners=True)
+
+        outputs.append(x)
+        if self.aux:
+            auxout = self.auxlayer(c3)
+            auxout = F.interpolate(auxout, size, mode='bilinear', align_corners=True)
+            outputs.append(auxout)
+        return tuple(outputs)
diff --git a/semantic_segmentation/src/models/upernet.py b/semantic_segmentation/src/models/upernet.py
new file mode 100644
index 00000000..a99f7eb6
--- /dev/null
+++ b/semantic_segmentation/src/models/upernet.py
@@ -0,0 +1,73 @@
+"""
+This module implements UperNet
+Unified Perceptual Parsing for Scene Understanding
+
+"""
+
+import math
+import paddle
+import paddle.nn as nn
+from src.models.backbones import SwinTransformer
+from src.models.decoders import UperHead, FCNHead
+
+
+class UperNet(nn.Layer):
+    """ UperNet
+
+    Attributes:
+        encoder: A backbone network for extracting features from the image.
+        auxi_head: A boolean indicating whether the auxiliary segmentation head is employed.
+        decoder_type: Type of decoder.
+        decoder: A decoder module for semantic segmentation.
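+
+    Example (illustrative sketch; assumes a config with a SwinTransformer encoder
+        and the UperHead decoder):
+        model = UperNet(config)
+        imgs = paddle.randn([2, 3, 512, 512])
+        preds = model(imgs)  # [main_logit, aux_logit], each a (B, num_classes, h, w) tensor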
+ + """ + def __init__(self, config): + super(UperNet, self).__init__() + if config.MODEL.ENCODER.TYPE == "SwinTransformer": + self.encoder = SwinTransformer(config) + self.num_layers = len(config.MODEL.TRANS.STAGE_DEPTHS) + self.auxi_head = config.MODEL.AUX.AUXIHEAD + self.decoder_type = config.MODEL.DECODER_TYPE + self.backbone_out_indices = config.MODEL.ENCODER.OUT_INDICES + + assert self.decoder_type == "UperHead", "only support UperHead decoder" + self.num_features =[] + for i in range(self.num_layers): + self.num_features.append(int(config.MODEL.TRANS.EMBED_DIM * 2 ** i)) + self.layer_norms = nn.LayerList() + for idx in self.backbone_out_indices: + self.layer_norms.append(nn.LayerNorm(self.num_features[idx])) + self.decoder = UperHead( + pool_scales=config.MODEL.UPERHEAD.POOL_SCALES, + in_channels=config.MODEL.UPERHEAD.IN_CHANNELS, + channels=config.MODEL.UPERHEAD.CHANNELS, + align_corners=config.MODEL.UPERHEAD.ALIGN_CORNERS, + num_classes=config.DATA.NUM_CLASSES) + self.auxi_head = config.MODEL.AUX.AUXIHEAD + if self.auxi_head == True: + self.aux_decoder = FCNHead( + in_channels=config.MODEL.AUXFCN.IN_CHANNELS, + num_classes=config.DATA.NUM_CLASSES, + up_ratio=config.MODEL.AUXFCN.UP_RATIO) + self.init__decoder_lr_coef(config) + + def init__decoder_lr_coef(self, config): + pass + + def to_2D(self, x): + n, hw, c = x.shape + h = w = int(math.sqrt(hw)) + x = x.transpose([0, 2, 1]).reshape([n, c, h, w]) + return x + + def forward(self, imgs): + # imgs.shapes: (B,3,H,W) + feats = self.encoder(imgs) + for idx in self.backbone_out_indices: + feat = self.layer_norms[idx](feats[idx]) + feats[idx] = self.to_2D(feat) + p2, p3, p4, p5 = feats + preds = [self.decoder([p2, p3, p4, p5])] + preds.append(self.aux_decoder(p4)) + return preds + diff --git a/semantic_segmentation/src/transforms/__init__.py b/semantic_segmentation/src/transforms/__init__.py new file mode 100644 index 00000000..c4f7d86e --- /dev/null +++ b/semantic_segmentation/src/transforms/__init__.py @@ -0,0 +1,2 @@ +from .transforms import * +from . import functional diff --git a/semantic_segmentation/src/transforms/functional.py b/semantic_segmentation/src/transforms/functional.py new file mode 100644 index 00000000..5bd0c18d --- /dev/null +++ b/semantic_segmentation/src/transforms/functional.py @@ -0,0 +1,87 @@ +import cv2 +import numpy as np +from PIL import Image, ImageEnhance +from scipy.ndimage.morphology import distance_transform_edt + + +def normalize(img, mean, std): + img = img.astype(np.float32, copy=False) / 255.0 + img -= mean + img /= std + return img + +def imnormalize(img, mean, std): + """Normalize an image with mean and std. + + Args: + img (ndarray): Image to be normalized. + mean (ndarray): The mean to be used for normalize. + std (ndarray): The std to be used for normalize. + to_rgb (bool): Whether to convert to rgb. + + Returns: + ndarray: The normalized image. + """ + img = img.copy().astype(np.float32) + return imnormalize_(img, mean, std) + +def imnormalize_(img, mean, std): + """Inplace normalize an image with mean and std. + + Args: + img (ndarray): Image to be normalized. (0~255) + mean (ndarray): The mean to be used for normalize. + std (ndarray): The std to be used for normalize. + to_rgb (bool): Whether to convert to rgb. + + Returns: + ndarray: The normalized image. 
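+
+    Example (illustrative sketch):
+        img = (np.random.rand(512, 512, 3) * 255).astype('float32')
+        mean = np.array([123.675, 116.28, 103.53])
+        std = np.array([58.395, 57.12, 57.375])
+        img = imnormalize_(img, mean, std)  # modified in place and returned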
+ """ + # cv2 inplace normalization does not accept uint8 + assert img.dtype != np.uint8 + mean = np.float64(mean.reshape(1, -1)) + stdinv = 1 / np.float64(std.reshape(1, -1)) + cv2.subtract(img, mean, img) # inplace + cv2.multiply(img, stdinv, img) # inplace + return img + +def horizontal_flip(img): + if len(img.shape) == 3: + img = img[:, ::-1, :] + elif len(img.shape) == 2: + img = img[:, ::-1] + return img + +def vertical_flip(img): + if len(img.shape) == 3: + img = img[::-1, :, :] + elif len(img.shape) == 2: + img = img[::-1, :] + return img + +def brightness(img, brightness_lower, brightness_upper): + brightness_delta = np.random.uniform(brightness_lower, brightness_upper) + img = ImageEnhance.Brightness(img).enhance(brightness_delta) + return img + +def contrast(img, contrast_lower, contrast_upper): + contrast_delta = np.random.uniform(contrast_lower, contrast_upper) + img = ImageEnhance.Contrast(img).enhance(contrast_delta) + return img + +def saturation(img, saturation_lower, saturation_upper): + saturation_delta = np.random.uniform(saturation_lower, saturation_upper) + img = ImageEnhance.Color(img).enhance(saturation_delta) + return img + +def hue(img, hue_lower, hue_upper): + hue_delta = np.random.uniform(hue_lower, hue_upper) + img = np.array(img.convert('HSV')) + img[:, :, 0] = img[:, :, 0] + hue_delta + img = Image.fromarray(img, mode='HSV').convert('RGB') + return img + +def rotate(img, rotate_lower, rotate_upper): + rotate_delta = np.random.uniform(rotate_lower, rotate_upper) + img = img.rotate(int(rotate_delta)) + return img diff --git a/semantic_segmentation/src/transforms/transforms.py b/semantic_segmentation/src/transforms/transforms.py new file mode 100644 index 00000000..bb1ac4c1 --- /dev/null +++ b/semantic_segmentation/src/transforms/transforms.py @@ -0,0 +1,633 @@ +import random +import numpy as np +import cv2 +from PIL import Image +from paddle.vision.transforms import functional as F +from src.transforms import functional + + +class Compose: + """ + Do transformation on input data with corresponding pre-processing and + augmentation operations. The shape of input data to all operations is + [height, width, channels]. + + Args: + transforms (list): A list contains data pre-processing or augmentation. + Empty list means only reading images, no transformation. + to_rgb (bool, optional): If converting image to RGB color space. + Default: True. + + Raises: + TypeError: When 'transforms' is not a list. + ValueError: when the length of 'transforms' is less than 1. + """ + + def __init__(self, transforms, to_rgb=True): + if not isinstance(transforms, list): + raise TypeError('The transforms must be a list!') + self.transforms = transforms + self.to_rgb = to_rgb + + def __call__(self, img, label=None): + """ + Args: + img (str|np.ndarray): It is either image path or image object. + label (str|np.ndarray): It is either label path or label ndarray. + + Returns: + (tuple). A tuple including image and label after transformation. 
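+
+            Example (illustrative sketch; the paths are placeholders):
+                transforms = Compose([Resize(target_size=(512, 512)), Normalize()])
+                img, label = transforms('path/to/image.jpg', 'path/to/label.png')
+                # img: float32 CHW ndarray, label: uint8 HW ndarray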
+ """ + if isinstance(img, str): + img = cv2.imread(img).astype('float32') + if isinstance(label, str): + label = np.asarray(Image.open(label).convert('P'), dtype=np.uint8) + if img is None: + raise ValueError('Can\'t read The image file {}!'.format(img)) + if self.to_rgb: + cv2.cvtColor(img, cv2.COLOR_BGR2RGB,img) + + for op in self.transforms: + outputs = op(img, label) + img = outputs[0] + if len(outputs) == 2: + label = outputs[1] + img = np.transpose(img, (2, 0, 1)) + return (img, label) + + +class RandomHorizontalFlip: + """ + Flip an image horizontally with a certain probability. + + Args: + prob (float, optional): A probability of horizontally flipping. + """ + + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, img, label=None): + if random.random() < self.prob: + img = functional.horizontal_flip(img) + if label is not None: + label = functional.horizontal_flip(label) + if label is None: + return (img, ) + else: + return (img, label) + +class RandomVerticalFlip: + """ + Flip an image vertically with a certain probability. + + Args: + prob (float, optional): A probability of vertical flipping. Default: 0.1. + """ + + def __init__(self, prob=0.1): + self.prob = prob + + def __call__(self, img, label=None): + if random.random() < self.prob: + img = functional.vertical_flip(img) + if label is not None: + label = functional.vertical_flip(label) + if label is None: + return (img, ) + else: + return (img, label) + +class Resize: + """ + Resize an image. If size is a sequence like (h, w), output size will be + matched to this. If size is an int, smaller edge of the image will be + matched to this number. i.e, if height > width, then image will be + rescaled to (size * height / width, size). + + Args: + target_size (list|tuple|int, optional): The target size of image. + interp (str, optional): The interpolation mode of resize is consistent + with opencv. ['NEAREST', 'LINEAR', 'CUBIC', 'AREA', 'LANCZOS4', 'RANDOM']. + Note that when it is 'RANDOM', a random interpolation mode would be specified. + + Raises: + TypeError: When 'target_size' type is neither list nor tuple. + ValueError: When "interp" is out of pre-defined methods ('NEAREST', + 'LINEAR', 'CUBIC', 'AREA', 'LANCZOS4', 'RANDOM'). + """ + + def __init__(self, target_size=520, interp='LINEAR', keep_ori_size=False): + self.interp = interp + self.keep_ori_size = keep_ori_size + + if isinstance(target_size, int): + assert target_size>0 + elif isinstance(target_size, list) or isinstance(target_size, tuple): + if len(target_size) != 2: + raise ValueError("`target_size` should include 2 elements, " + "but it is {}".format(target_size)) + else: + raise TypeError( + "Type of `target_size` is invalid. It should be list or tuple, " + "but it is {}".format(type(target_size))) + self.target_size = target_size + + def __call__(self, img, label=None): + """ + Args: + img (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple). When label is None, it returns (im, ), otherwise it + returns (im, label), + + Raises: + TypeError: When the 'img' type is not numpy. + ValueError: When the length of "im" shape is not 3. 
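+
+            Example (illustrative sketch):
+                resize = Resize(target_size=(512, 512))
+                img = np.random.rand(375, 500, 3).astype('float32')
+                img, = resize(img)  # img.shape -> (512, 512, 3)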
+ """ + + if not isinstance(img, np.ndarray): + raise TypeError("Resize: image type is not numpy.") + if len(img.shape) != 3: + raise ValueError('Resize: image is not 3-dimensional.') + if self.interp == "RANDOM": + interp = random.choice(list(self.interp_dict.keys())) + else: + interp = self.interp + if not self.keep_ori_size: + img = F.resize(img, self.target_size, 'bilinear') + + if label is not None: + label = F.resize(label, self.target_size,'nearest') + + if label is None: + return (img, ) + else: + return (img, label) + +class ResizeStepScaling: + """ + Scale an image proportionally within a range. + + Args: + min_scale_factor (float, optional): The minimum scale. Default: 0.75. + max_scale_factor (float, optional): The maximum scale. Default: 1.25. + scale_step_size (float, optional): The scale interval. Default: 0.25. + + Raises: + ValueError: When min_scale_factor is smaller than max_scale_factor. + """ + + def __init__(self, + min_scale_factor=0.75, + max_scale_factor=1.25, + scale_step_size=0.25): + if min_scale_factor > max_scale_factor: + raise ValueError( + "min_scale_factor must be less than max_scale_factor, " + "but they are {} and {}.".format(min_scale_factor, + max_scale_factor)) + self.min_scale_factor = min_scale_factor + self.max_scale_factor = max_scale_factor + self.scale_step_size = scale_step_size + + def __call__(self, img, label=None): + """ + Args: + img (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple). When label is None, it returns (img, ), otherwise it + returns (img, label). + """ + + if self.min_scale_factor == self.max_scale_factor: + scale_factor = self.min_scale_factor + + elif self.scale_step_size == 0: + scale_factor = np.random.uniform(self.min_scale_factor, + self.max_scale_factor) + + else: + # option 1 + scale_factor = np.random.random_sample() * (self.max_scale_factor + - self.min_scale_factor) + self.min_scale_factor + # option 2 + #num_steps = int((self.max_scale_factor - self.min_scale_factor) /self.scale_step_size + 1) + #scale_factors = np.linspace(self.min_scale_factor,self.max_scale_factor, num_steps).tolist() + #np.random.shuffle(scale_factors) + #scale_factor = scale_factors[0] + w = int(round(scale_factor * img.shape[1])) + h = int(round(scale_factor * img.shape[0])) + img = F.resize(img, (w, h), 'bilinear') + if label is not None: + label = F.resize(label, (w, h), 'nearest') + if label is None: + return (img, ) + else: + return (img, label) + +class Normalize: + """ + Normalize an image. + + Args: + mean (list, optional): The mean value of a dataset. Default: + [0.5, 0.5, 0.5]. + std (list, optional): The standard deviation of a dataset. Default: + [0.5, 0.5, 0.5]. + + Raises: + ValueError: When mean/std is not list or any value in std is 0. + """ + + def __init__(self, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)): + self.mean = mean + self.std = std + if not (isinstance(self.mean, (list, tuple)) + and isinstance(self.std, (list, tuple))): + raise ValueError("{}: input type is invalid. It should be list or " + "tuple".format(self)) + from functools import reduce + if reduce(lambda x, y: x * y, self.std) == 0: + raise ValueError('{}: std is invalid!'.format(self)) + + def __call__(self, img, label=None): + """ + Args: + img (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple). When label is None, it returns (img, ), otherwise it + returns (im, label). 
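+
+            Example (illustrative sketch):
+                normalize = Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
+                img = np.random.rand(512, 512, 3).astype('float32')
+                img, = normalize(img)  # (img - mean) / std, applied channel-wise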
+ """ + + mean = np.array(self.mean).reshape(1,-1) + std = np.array(self.std).reshape(1,-1) + # option 1 + #img = functional.normalize(img, mean, std) + # option 2 + img = functional.imnormalize(img, mean, std) + if label is None: + return (img, ) + else: + return (img, label) + +class Padding: + """ + Add bottom-right padding to a raw image or annotation image. + + Args: + target_size (list|tuple): The target size after padding. + im_padding_value (list, optional): The padding value of raw image. + Default: [127.5, 127.5, 127.5]. + label_padding_value (int, optional): The padding value of annotation + image. Default: 255. + + Raises: + TypeError: When target_size is neither list nor tuple. + ValueError: When the length of target_size is not 2. + """ + + def __init__(self, + target_size, + im_padding_value=(127.5, 127.5, 127.5), + label_padding_value=255): + if isinstance(target_size, list) or isinstance(target_size, tuple): + if len(target_size) != 2: + raise ValueError( + "`target_size` should include 2 elements, but it is {}". + format(target_size)) + else: + raise TypeError("Type of target_size is invalid. It should be list " + "or tuple, now is {}".format(type(target_size))) + self.target_size = target_size + self.im_padding_value = im_padding_value + self.label_padding_value = label_padding_value + + def __call__(self, img, label=None): + """ + Args: + img (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple): When label is None, it returns (img, ), otherwise it + returns (img, label). + """ + + img_height, img_width = img.shape[0], img.shape[1] + if isinstance(self.target_size, int): + target_height = self.target_size + target_width = self.target_size + else: + target_height = self.target_size[1] + target_width = self.target_size[0] + pad_height = target_height - img_height + pad_width = target_width - img_width + if pad_height < 0 or pad_width < 0: + raise ValueError("The size of image should be less than `target_size`, " + "but the size of image ({}, {}) is larger than `target_size` " + "({}, {})".format(img_width, img_height, target_width, target_height)) + else: + img = cv2.copyMakeBorder( + img, 0, pad_height, 0, pad_width, cv2.BORDER_CONSTANT, + value=self.im_padding_value) + if label is not None: + label = cv2.copyMakeBorder( + label, 0, pad_height, 0, pad_width, cv2.BORDER_CONSTANT, + value=self.label_padding_value) + if label is None: + return (img, ) + else: + return (img, label) + +class RandomPaddingCrop: + """ + Crop a sub-image from a raw image and annotation image randomly. If the + target cropping siz is larger than original image, then the bottom-right + padding will be added. + + Args: + crop_size (tuple, optional): The target cropping size. + img_padding_value (list, optional): The padding value of raw image. + Default: (123.675, 116.28, 103.53). + label_padding_value (int, optional): The padding value of annotation + image. Default: 255. + + Raises: + TypeError: When crop_size is neither list nor tuple. + ValueError: When the length of crop_size is not 2. + """ + + def __init__(self, + crop_size=(512, 512), + img_padding_value=(123.675, 116.28, 103.53), + label_padding_value=255): + if isinstance(crop_size, list) or isinstance(crop_size, tuple): + if len(crop_size) != 2: + raise ValueError("Type of `crop_size` is list or tuple. It " + "should include 2 elements, but it is {}" + .format(crop_size)) + else: + raise TypeError("The type of `crop_size` is invalid. 
It should " + "be list or tuple, but it is {}" + .format(type(crop_size))) + self.crop_size = crop_size + self.img_padding_value = img_padding_value + self.label_padding_value = label_padding_value + + def __call__(self, img, label=None): + """ + Args: + img (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple): When label is None, it returns (img, ), otherwise it + returns (img, label). + """ + + if isinstance(self.crop_size, int): + crop_width = self.crop_size + crop_height = self.crop_size + else: + crop_width = self.crop_size[0] + crop_height = self.crop_size[1] + + img_height = img.shape[0] + img_width = img.shape[1] + + if img_height == crop_height and img_width == crop_width: + if label is None: + return (img, ) + else: + return (img, label) + else: + pad_height = max(crop_height - img_height, 0) + pad_width = max(crop_width - img_width, 0) + if (pad_height > 0 or pad_width > 0): + img = cv2.copyMakeBorder( + img, 0, pad_height, 0, pad_width, cv2.BORDER_CONSTANT, + value=self.img_padding_value) + if label is not None: + label = cv2.copyMakeBorder( + label, 0, pad_height, 0, pad_width, cv2.BORDER_CONSTANT, + value=self.label_padding_value) + img_height = img.shape[0] + img_width = img.shape[1] + + if crop_height > 0 and crop_width > 0: + h_off = np.random.randint(img_height - crop_height + 1) + w_off = np.random.randint(img_width - crop_width + 1) + + img = img[h_off:(crop_height + h_off), w_off:(w_off + crop_width), :] + if label is not None: + label = label[h_off:(crop_height + h_off), w_off:(w_off + crop_width)] + if label is None: + return (img, ) + else: + return (img, label) + +class RandomBlur: + """ + Blurring an image by a Gaussian function with a certain probability. + + Args: + prob (float, optional): A probability of blurring an image. Default: 0.1. + """ + + def __init__(self, prob=0.1): + self.prob = prob + + def __call__(self, img, label=None): + """ + Args: + img (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple). When label is None, it returns (img, ), otherwise + it returns (img, label). + """ + + if self.prob <= 0: + n = 0 + elif self.prob >= 1: + n = 1 + else: + n = int(1.0 / self.prob) + if n > 0: + if np.random.randint(0, n) == 0: + radius = np.random.randint(3, 10) + if radius % 2 != 1: + radius = radius + 1 + if radius > 9: + radius = 9 + img = cv2.GaussianBlur(img, (radius, radius), 0, 0) + + if label is None: + return (img, ) + else: + return (img, label) + +class RandomRotation: + """ + Rotate an image randomly with padding. + + Args: + max_rotation (float, optional): The maximum rotation degree. Default: 15. + img_padding_value (list, optional): The padding value of raw image. + Default: [127.5, 127.5, 127.5]. + label_padding_value (int, optional): The padding value of annotation + image. Default: 255. + """ + + def __init__(self, + max_rotation=15, + img_padding_value=(127.5, 127.5, 127.5), + label_padding_value=255): + self.max_rotation = max_rotation + self.img_padding_value = img_padding_value + self.label_padding_value = label_padding_value + + def __call__(self, img, label=None): + """ + Args: + img (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple): When label is None, it returns (img, ), otherwise + it returns (img, label). 
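+
+        Example (illustrative; the rotation angle is sampled uniformly from
+        [-max_rotation, max_rotation] and the canvas is padded to fit):
+
+            rotate = RandomRotation(max_rotation=15,
+                                    img_padding_value=(127.5, 127.5, 127.5),
+                                    label_padding_value=255)
+            img, label = rotate(img, label)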
+ """ + + if self.max_rotation > 0: + (h, w) = img.shape[:2] + do_rotation = np.random.uniform(-self.max_rotation, + self.max_rotation) + pc = (w // 2, h // 2) + r = cv2.getRotationMatrix2D(pc, do_rotation, 1.0) + cos = np.abs(r[0, 0]) + sin = np.abs(r[0, 1]) + + nw = int((h * sin) + (w * cos)) + nh = int((h * cos) + (w * sin)) + + (cx, cy) = pc + r[0, 2] += (nw / 2) - cx + r[1, 2] += (nh / 2) - cy + dsize = (nw, nh) + img = cv2.warpAffine( + img, r, dsize=dsize, flags=cv2.INTER_LINEAR, + borderMode=cv2.BORDER_CONSTANT, + borderValue=self.im_padding_value) + if label is not None: + label = cv2.warpAffine( + label, r, dsize=dsize, flags=cv2.INTER_NEAREST, + borderMode=cv2.BORDER_CONSTANT, + borderValue=self.label_padding_value) + + if label is None: + return (img, ) + else: + return (img, label) + +class RandomDistort: + """ + Distort an image with random configurations. + + Args: + brightness_range (float, optional): The range of brightness. + brightness_prob (float, optional): The probability of adjusting brightness. + contrast_range (float, optional): The range of contrast. + contrast_prob (float, optional): The probability of adjusting contrast. + saturation_range (float, optional): The range of saturation. + saturation_prob (float, optional): The probability of adjusting saturation. + hue_range (int, optional): The range of hue. + hue_prob (float, optional): The probability of adjusting hue. + """ + + def __init__(self, + brightness_range=0.5, + brightness_prob=0.5, + contrast_range=0.5, + contrast_prob=0.5, + saturation_range=0.5, + saturation_prob=0.5, + hue_range=18, + hue_prob=0.5): + self.brightness_range = brightness_range + self.brightness_prob = brightness_prob + self.contrast_range = contrast_range + self.contrast_prob = contrast_prob + self.saturation_range = saturation_range + self.saturation_prob = saturation_prob + self.hue_range = hue_range + self.hue_prob = hue_prob + + def __call__(self, img, label=None): + """ + Args: + img (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple): When label is None, it returns (img, ), + otherwise it returns (img, label). 
+ """ + + brightness_lower = 1 - self.brightness_range + brightness_upper = 1 + self.brightness_range + contrast_lower = 1 - self.contrast_range + contrast_upper = 1 + self.contrast_range + saturation_lower = 1 - self.saturation_range + saturation_upper = 1 + self.saturation_range + hue_lower = -self.hue_range + hue_upper = self.hue_range + ops = [ + functional.brightness, functional.contrast, functional.saturation, + functional.hue + ] + random.shuffle(ops) + params_dict = { + 'brightness': { + 'brightness_lower': brightness_lower, + 'brightness_upper': brightness_upper + }, + 'contrast': { + 'contrast_lower': contrast_lower, + 'contrast_upper': contrast_upper + }, + 'saturation': { + 'saturation_lower': saturation_lower, + 'saturation_upper': saturation_upper + }, + 'hue': { + 'hue_lower': hue_lower, + 'hue_upper': hue_upper + } + } + prob_dict = { + 'brightness': self.brightness_prob, + 'contrast': self.contrast_prob, + 'saturation': self.saturation_prob, + 'hue': self.hue_prob + } + img = img.astype('uint8') + img = Image.fromarray(img) + for id in range(len(ops)): + params = params_dict[ops[id].__name__] + prob = prob_dict[ops[id].__name__] + params['img'] = img + if np.random.uniform(0, 1) < prob: + img = ops[id](**params) + img = np.asarray(img).astype('float32') + if label is None: + return (img, ) + else: + return (img, label) diff --git a/semantic_segmentation/src/utils/__init__.py b/semantic_segmentation/src/utils/__init__.py new file mode 100644 index 00000000..e18e0c0b --- /dev/null +++ b/semantic_segmentation/src/utils/__init__.py @@ -0,0 +1,5 @@ +from . import logger +from . import metrics +from .checkpoint import load_entire_model, load_pretrained_model, resume +from .timer import TimeAverager, calculate_eta +from . import vis diff --git a/semantic_segmentation/src/utils/checkpoint.py b/semantic_segmentation/src/utils/checkpoint.py new file mode 100644 index 00000000..0ca530f8 --- /dev/null +++ b/semantic_segmentation/src/utils/checkpoint.py @@ -0,0 +1,101 @@ +import math +import os +import paddle.nn.functional as F +import paddle +from src.utils import logger + +def load_entire_model(model, pretrained): + """ + Load the weights of the whole model + + Arges: + model: model based paddle + pretrained: the path of weight file of model + """ + + if pretrained is not None: + load_pretrained_model(model, pretrained) + else: + logger.warning('Not all pretrained params of {} are loaded, ' \ + 'training from scratch or a pretrained backbone.'.format( + model.__class__.__name__)) + + +def load_pretrained_model(model, pretrained_model, pos_embed_interp=True): + if pretrained_model is not None: + logger.info('Loading pretrained model from {}'.format(pretrained_model)) + if os.path.exists(pretrained_model): + para_state_dict = paddle.load(pretrained_model) + model_state_dict = model.state_dict() + keys = model_state_dict.keys() + # 4 debug + #print("pretrained para_state_dict.len: ", len(para_state_dict.keys())) + #print("current model weight.len: ",len(keys)) + match_list=[] + not_match_list=[] + num_params_loaded = 0 + for k in keys: + if k not in para_state_dict: + logger.warning("{} is not in pretrained model".format(k)) + not_match_list.append(k) + elif list(para_state_dict[k].shape) != list(model_state_dict[k].shape): + if pos_embed_interp==True: + n, pretrain_num_patches, c = para_state_dict[k].shape # pretrain_num_patches=hw+1 + n, cur_model_num_patches, c = model_state_dict[k].shape + h = w = int(math.sqrt(pretrain_num_patches)) + pos_embed_weight = para_state_dict[k][:, (-h * 
w):] # (n,hw,c) + pos_embed_weight = pos_embed_weight.transpose([0,2,1]) # (n,c,hw) + pos_embed_weight = pos_embed_weight.reshape([n, c, h, w]) # type: numpy + pos_embed_weight = paddle.to_tensor(pos_embed_weight) + cur_h=int(math.sqrt(cur_model_num_patches)) + cur_pos_embed_weight = F.interpolate(pos_embed_weight, size=(cur_h, cur_h), mode='bilinear', align_corners=False) + cur_pos_embed_weight = cur_pos_embed_weight.reshape([n, c, -1]).transpose([0,2,1]) + cls_token_weight = para_state_dict[k][:, 0] + cls_token_weight = paddle.to_tensor(cls_token_weight).unsqueeze(1) + model_state_dict[k] = paddle.concat((cls_token_weight, cur_pos_embed_weight), axis=1).numpy() + num_params_loaded += 1 + match_list.append(k) + else: + logger.warning("[SKIP] Shape of pretrained params {} doesn't match.(Pretrained: {}, Actual: {})" + .format(k, para_state_dict[k].shape, + model_state_dict[k].shape)) + + else: + model_state_dict[k] = para_state_dict[k] + num_params_loaded += 1 + match_list.append(k) + + model.set_dict(model_state_dict) + logger.info("There are {}/{} variables loaded into {}.".format( + num_params_loaded, len(model_state_dict), model.__class__.__name__)) + logger.info(" {} parameters is matched, and {} parameters is Not matched".format( + len(match_list), len(not_match_list))) + + else: + raise ValueError('The pretrained model directory is not Found: {}'.format( + pretrained_model)) + else: + logger.info('No pretrained model to load, {} will be trained from scratch.'. + format(model.__class__.__name__)) + + +def resume(model, optimizer, resume_model): + if resume_model is not None: + logger.info('Resume model from {}'.format(resume_model)) + if os.path.exists(resume_model): + resume_model = os.path.normpath(resume_model) + ckpt_path = os.path.join(resume_model, 'model.pdparams') + para_state_dict = paddle.load(ckpt_path) + ckpt_path = os.path.join(resume_model, 'model.pdopt') + opti_state_dict = paddle.load(ckpt_path) + model.set_state_dict(para_state_dict) + optimizer.set_state_dict(opti_state_dict) + + iter = resume_model.split('_')[-1] + iter = int(iter) + return iter + else: + raise ValueError('Directory of the model needed to resume is not Found: {}'. 
+ format(resume_model)) + else: + logger.info('No model needed to resume.') diff --git a/semantic_segmentation/src/utils/logger.py b/semantic_segmentation/src/utils/logger.py new file mode 100644 index 00000000..a3b7e372 --- /dev/null +++ b/semantic_segmentation/src/utils/logger.py @@ -0,0 +1,30 @@ +import sys +import time +import paddle + +levels = {0: 'ERROR', 1: 'WARNING', 2: 'INFO', 3: 'DEBUG'} +log_level = 2 + +def log(level=2, message=""): + if paddle.distributed.ParallelEnv().local_rank == 0: + current_time = time.time() + time_array = time.localtime(current_time) + current_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array) + if log_level >= level: + print("{} [{}]\t{}".format( + current_time, + levels[level], + message).encode("utf-8").decode("latin1")) + sys.stdout.flush() + +def debug(message=""): + log(level=3, message=message) + +def info(message=""): + log(level=2, message=message) + +def warning(message=""): + log(level=1, message=message) + +def error(message=""): + log(level=0, message=message) diff --git a/semantic_segmentation/src/utils/metrics.py b/semantic_segmentation/src/utils/metrics.py new file mode 100644 index 00000000..644989ca --- /dev/null +++ b/semantic_segmentation/src/utils/metrics.py @@ -0,0 +1,133 @@ +import numpy as np +import paddle +import paddle.nn.functional as F + + +def calculate_area(pred, label, num_classes, ignore_index=255): + """ + Calculate intersect, prediction and label area + + Args: + pred (type: Tensor, shape: [B,1,H,W]): prediction results. + label (type: Tensor, shape: [B,1,H,W]): ground truth (segmentation) + num_classes (int): The unique number of target classes. + ignore_index (int): Specifies a class that is ignored. Default: 255. + + Returns: + Tensor: The intersection area of prediction and the ground on all class. + Tensor: The prediction area on all class. + Tensor: The ground truth area on all class. + """ + + if len(pred.shape) == 4: + pred = paddle.squeeze(pred, axis=1) + if len(label.shape) == 4: + label = paddle.squeeze(label, axis=1) + if not pred.shape == label.shape: + raise ValueError('Shape of `pred` and `label should be equal, ' + 'but there are {} and {}.'.format(pred.shape, label.shape)) + + # Delete ignore_index + mask = label != ignore_index + pred = pred + 1 + label = label + 1 + pred = pred * mask + label = label * mask + pred = F.one_hot(pred, num_classes + 1) # dtype: float32 + label = F.one_hot(label, num_classes + 1) + pred = pred[:, :, :, 1:] # shape: [1,H,W,num_class+1] + label = label[:, :, :, 1:] + pred_area = [] + label_area = [] + intersect_area = [] + for i in range(num_classes): + pred_i = pred[:, :, :, i] + label_i = label[:, :, :, i] + pred_area_i = paddle.sum(pred_i) + label_area_i = paddle.sum(label_i) + intersect_area_i = paddle.sum(pred_i * label_i) + pred_area.append(pred_area_i) + label_area.append(label_area_i) + intersect_area.append(intersect_area_i) + pred_area = paddle.concat(pred_area) + label_area = paddle.concat(label_area) + intersect_area = paddle.concat(intersect_area) + return intersect_area, pred_area, label_area + +def mean_iou(intersect_area, pred_area, label_area): + """ + Calculate iou. + + Args: + intersect_area (Tensor): The intersection area of prediction and ground + truth on all classes. + pred_area (Tensor): The prediction area on all classes. + label_area (Tensor): The ground truth area on all classes. + + Returns: + class_iou (np.ndarray): iou on all classes. + mean_iou (float): mean iou of all classes. 
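+
+    Example (illustrative; the three inputs are the per-class area tensors
+    produced by `calculate_area` above, variable names are only placeholders):
+
+        inter, pred, label = calculate_area(pred_tensor, label_tensor, num_classes=19)
+        class_iou, miou = mean_iou(inter, pred, label)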
+ """ + + intersect_area = intersect_area.numpy() + pred_area = pred_area.numpy() + label_area = label_area.numpy() + union = pred_area + label_area - intersect_area + class_iou = [] + for i in range(len(intersect_area)): + if union[i] == 0: + iou = 0 + else: + iou = 1.0*intersect_area[i] / union[i] + class_iou.append(iou) + mean_iou = np.mean(class_iou) + return np.array(class_iou), mean_iou + +def accuracy(intersect_area, pred_area): + """ + Calculate accuracy + + Args: + intersect_area (Tensor): The intersection area of prediction and ground + truth on all classeds. + pred_area (Tensor): The prediction area on all classes. + + Returns: + class_acc (np.ndarray): accuracy on all classes. + mean_acc (float): mean accuracy. + """ + + intersect_area = intersect_area.numpy() + pred_area = pred_area.numpy() + class_acc = [] + for i in range(len(intersect_area)): + if pred_area[i] == 0: + acc = 0 + else: + acc = intersect_area[i] / pred_area[i] + class_acc.append(acc) + mean_acc = np.sum(intersect_area) / np.sum(pred_area) + return np.array(class_acc), mean_acc + +def kappa(intersect_area, pred_area, label_area): + """ + Calculate kappa coefficient + + Args: + intersect_area (Tensor): The intersection area of prediction and ground + truth on all classes. + pred_area (Tensor): The prediction area on all classes. + label_area (Tensor): The ground truth area on all classes. + + Returns: + kappa (float): kappa coefficient. + """ + + intersect_area = intersect_area.numpy() + pred_area = pred_area.numpy() + label_area = label_area.numpy() + total_area = np.sum(label_area) + po = np.sum(intersect_area) / total_area + pe = np.sum(pred_area * label_area) / (total_area * total_area) + kappa = (po - pe) / (1 - pe) + return kappa diff --git a/semantic_segmentation/src/utils/progbar.py b/semantic_segmentation/src/utils/progbar.py new file mode 100644 index 00000000..e639bce4 --- /dev/null +++ b/semantic_segmentation/src/utils/progbar.py @@ -0,0 +1,195 @@ +import os +import sys +import time +import numpy as np + + +class Progbar(object): + """ + Displays a progress bar. + It refers to https://github.com/keras-team/keras/blob/keras-2/keras/utils/generic_utils.py + + Args: + target (int): Total number of steps expected, None if unknown. + width (int): Progress bar width on screen. + verbose (int): Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose) + interval (float): Minimum visual progress update interval (in seconds). + stateful_metrics (list|tuple): Iterable of string names of metrics that + should not be averaged over time. Metrics in this list will be displayed as-is. All + others will be averaged by the progbar before display. + unit_name (str): Display name for step counts (usually "step" or "sample"). 
+ """ + + def __init__(self, + target, + width=30, + verbose=1, + interval=0.05, + stateful_metrics=None, + unit_name='step'): + self.target = target + self.width = width + self.verbose = verbose + self.interval = interval + self.unit_name = unit_name + if stateful_metrics: + self.stateful_metrics = set(stateful_metrics) + else: + self.stateful_metrics = set() + + self._dynamic_display = ((hasattr(sys.stderr, 'isatty') + and sys.stderr.isatty()) + or 'ipykernel' in sys.modules + or 'posix' in sys.modules + or 'PYCHARM_HOSTED' in os.environ) + self._total_width = 0 + self._seen_so_far = 0 + # We use a dict + list to avoid garbage collection + # issues found in OrderedDict + self._values = {} + self._values_order = [] + self._start = time.time() + self._last_update = 0 + + def update(self, current, values=None, finalize=None): + """ + Updates the progress bar. + + Args: + current (int): Index of current step. + values (list): List of tuples: `(name, value_for_last_step)`. + If `name` is in `stateful_metrics`, `value_for_last_step` will be + displayed as-is. Else, an average of the metric over time will be + displayed. + finalize (bool): Whether this is the last update for the progress bar. + If `None`, defaults to `current >= self.target`. + """ + + if finalize is None: + if self.target is None: + finalize = False + else: + finalize = current >= self.target + + values = values or [] + for k, v in values: + if k not in self._values_order: + self._values_order.append(k) + if k not in self.stateful_metrics: + # In the case that progress bar doesn't have a target value in the first + # epoch, both on_batch_end and on_epoch_end will be called, which will + # cause 'current' and 'self._seen_so_far' to have the same value. Force + # the minimal value to 1 here, otherwise stateful_metric will be 0s. + value_base = max(current - self._seen_so_far, 1) + if k not in self._values: + self._values[k] = [v * value_base, value_base] + else: + self._values[k][0] += v * value_base + self._values[k][1] += value_base + else: + # Stateful metrics output a numeric value. This representation + # means "take an average from a single value" but keeps the + # numeric formatting. + self._values[k] = [v, 1] + self._seen_so_far = current + + now = time.time() + info = ' - %.0fs' % (now - self._start) + if self.verbose == 1: + if now - self._last_update < self.interval and not finalize: + return + + prev_total_width = self._total_width + if self._dynamic_display: + sys.stderr.write('\b' * prev_total_width) + sys.stderr.write('\r') + else: + sys.stderr.write('\n') + + if self.target is not None: + numdigits = int(np.log10(self.target)) + 1 + bar = ('%' + str(numdigits) + 'd/%d [') % (current, self.target) + prog = float(current) / self.target + prog_width = int(self.width * prog) + if prog_width > 0: + bar += ('=' * (prog_width - 1)) + if current < self.target: + bar += '>' + else: + bar += '=' + bar += ('.' 
* (self.width - prog_width)) + bar += ']' + else: + bar = '%7d/Unknown' % current + + self._total_width = len(bar) + sys.stderr.write(bar) + + if current: + time_per_unit = (now - self._start) / current + else: + time_per_unit = 0 + + if self.target is None or finalize: + if time_per_unit >= 1 or time_per_unit == 0: + info += ' %.0fs/%s' % (time_per_unit, self.unit_name) + elif time_per_unit >= 1e-3: + info += ' %.0fms/%s' % (time_per_unit * 1e3, self.unit_name) + else: + info += ' %.0fus/%s' % (time_per_unit * 1e6, self.unit_name) + else: + eta = time_per_unit * (self.target - current) + if eta > 3600: + eta_format = '%d:%02d:%02d' % (eta // 3600, + (eta % 3600) // 60, eta % 60) + elif eta > 60: + eta_format = '%d:%02d' % (eta // 60, eta % 60) + else: + eta_format = '%ds' % eta + + info = ' - ETA: %s' % eta_format + + for k in self._values_order: + info += ' - %s:' % k + if isinstance(self._values[k], list): + avg = np.mean( + self._values[k][0] / max(1, self._values[k][1])) + if abs(avg) > 1e-3: + info += ' %.4f' % avg + else: + info += ' %.4e' % avg + else: + info += ' %s' % self._values[k] + + self._total_width += len(info) + if prev_total_width > self._total_width: + info += (' ' * (prev_total_width - self._total_width)) + + if finalize: + info += '\n' + + sys.stderr.write(info) + sys.stderr.flush() + + elif self.verbose == 2: + if finalize: + numdigits = int(np.log10(self.target)) + 1 + count = ('%' + str(numdigits) + 'd/%d') % (current, self.target) + info = count + info + for k in self._values_order: + info += ' - %s:' % k + avg = np.mean( + self._values[k][0] / max(1, self._values[k][1])) + if avg > 1e-3: + info += ' %.4f' % avg + else: + info += ' %.4e' % avg + info += '\n' + + sys.stderr.write(info) + sys.stderr.flush() + + self._last_update = now + + def add(self, n, values=None): + self.update(self._seen_so_far + n, values) diff --git a/semantic_segmentation/src/utils/timer.py b/semantic_segmentation/src/utils/timer.py new file mode 100644 index 00000000..c4b3a7ec --- /dev/null +++ b/semantic_segmentation/src/utils/timer.py @@ -0,0 +1,38 @@ +import time + + +class TimeAverager(object): + def __init__(self): + self.reset() + + def reset(self): + self._cnt = 0 + self._total_time = 0 + self._total_samples = 0 + + def record(self, usetime, num_samples=None): + self._cnt += 1 + self._total_time += usetime + if num_samples: + self._total_samples += num_samples + + def get_average(self): + if self._cnt == 0: + return 0 + return self._total_time / float(self._cnt) + + def get_ips_average(self): + if not self._total_samples or self._cnt == 0: + return 0 + return float(self._total_samples) / self._total_time + +def calculate_eta(remaining_step, speed): + if remaining_step < 0: + remaining_step = 0 + remaining_time = int(remaining_step * speed) + result = "{:0>2}:{:0>2}:{:0>2}" + arr = [] + for i in range(2, -1, -1): + arr.append(int(remaining_time / 60**i)) + remaining_time %= 60**i + return result.format(*arr) diff --git a/semantic_segmentation/src/utils/vis.py b/semantic_segmentation/src/utils/vis.py new file mode 100644 index 00000000..2773307b --- /dev/null +++ b/semantic_segmentation/src/utils/vis.py @@ -0,0 +1,84 @@ +import cv2 +import numpy as np + +def visualize(img_path, pred, weight=0.6): + """ + Convert predict result to color image, and save added image. + + Args: + img_path (str): The path of input image. + pred (np.ndarray): The predict result of segmentation model. + weight (float): The image weight of visual image, and the result weight + is (1 - weight). 
Default: 0.6 + + Returns: + vis_result (np.ndarray): the visualized result. + """ + + color_map = get_pseudo_color_map(256) + color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)] + color_map = np.array(color_map).astype("uint8") + # Use OpenCV LUT for color mapping + c1 = cv2.LUT(pred, color_map[:, 0]) + c2 = cv2.LUT(pred, color_map[:, 1]) + c3 = cv2.LUT(pred, color_map[:, 2]) + pseudo_img = np.dstack((c1, c2, c3)) + img = cv2.imread(img_path) + vis_result = cv2.addWeighted(img, weight, pseudo_img, 1 - weight, 0) + return vis_result + +def get_cityscapes_color_map(): + """ + Get the color map of Cityscapes dataset + + Returns: + color_map (list): The color map of Cityscapes + """ + num_cls = 20 + color_map = [0] * (num_cls * 3) + color_map[0:3] = (128, 64, 128) # 0: 'road' + color_map[3:6] = (244, 35,232) # 1 'sidewalk' + color_map[6:9] = (70, 70, 70) # 2''building' + color_map[9:12] = (102,102,156) # 3 wall + color_map[12:15] = (190,153,153) # 4 fence + color_map[15:18] = (153,153,153) # 5 pole + color_map[18:21] = (250,170, 30) # 6 'traffic light' + color_map[21:24] = (220,220, 0) # 7 'traffic sign' + color_map[24:27] = (107,142, 35) # 8 'vegetation' + color_map[27:30] = (152,251,152) # 9 'terrain' + color_map[30:33] = ( 70,130,180) # 10 sky + color_map[33:36] = (220, 20, 60) # 11 person + color_map[36:39] = (255, 0, 0) # 12 rider + color_map[39:42] = (0, 0, 142) # 13 car + color_map[42:45] = (0, 0, 70) # 14 truck + color_map[45:48] = (0, 60,100) # 15 bus + color_map[48:51] = (0, 80,100) # 16 train + color_map[51:54] = (0, 0,230) # 17 'motorcycle' + color_map[54:57] = (119, 11, 32) # 18 'bicycle' + color_map[57:60] = (105, 105, 105) + return color_map + +def get_pseudo_color_map(num_classes=256): + """ + Get the pseduo color map for visualizing the segmentation mask, + + Args: + num_classes (int): Number of classes. + + Returns: + colar_map (list): The color map. 
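+
+    Example (illustrative; `visualize` above reshapes the flat list into an
+    N x 3 uint8 array before using it as a lookup table):
+
+        color_map = get_pseudo_color_map(256)               # flat list, 3 values per class
+        lut = np.array(color_map).reshape(-1, 3).astype("uint8")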
+ """ + + num_classes += 1 + color_map = num_classes * [0, 0, 0] + for i in range(0, num_classes): + j = 0 + lab = i + while lab: + color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j)) + color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)) + color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)) + j += 1 + lab >>= 3 + color_map = color_map[3:] + return color_map diff --git a/semantic_segmentation/tools/convert_cityscapes.py b/semantic_segmentation/tools/convert_cityscapes.py new file mode 100644 index 00000000..47a253eb --- /dev/null +++ b/semantic_segmentation/tools/convert_cityscapes.py @@ -0,0 +1,49 @@ +import argparse +import os.path as osp +import mmcv +from cityscapesscripts.preparation.json2labelImg import json2labelImg + +def convert_json_to_label(json_file): + label_file = json_file.replace('_polygons.json', '_labelTrainIds.png') + json2labelImg(json_file, label_file, 'trainIds') + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert Cityscapes annotations to TrainIds') + parser.add_argument('--cityscapes_path', + default='/home/ssd3/wutianyi/datasets/cityscapes', help='cityscapes data path') + parser.add_argument('--gt-dir', default='gtFine', type=str) + parser.add_argument('-o', '--out-dir', help='output path') + parser.add_argument( + '--nproc', default=1, type=int, help='number of process') + args = parser.parse_args() + return args + +def main(): + args = parse_args() + cityscapes_path = args.cityscapes_path + out_dir = args.out_dir if args.out_dir else cityscapes_path + mmcv.mkdir_or_exist(out_dir) + gt_dir = osp.join(cityscapes_path, args.gt_dir) + poly_files = [] + for poly in mmcv.scandir(gt_dir, '_polygons.json', recursive=True): + poly_file = osp.join(gt_dir, poly) + poly_files.append(poly_file) + if args.nproc > 1: + mmcv.track_parallel_progress(convert_json_to_label, poly_files, + args.nproc) + else: + mmcv.track_progress(convert_json_to_label, poly_files) + + split_names = ['train', 'val', 'test'] + for split in split_names: + filenames = [] + for poly in mmcv.scandir( + osp.join(gt_dir, split), '_polygons.json', recursive=True): + filenames.append(poly.replace('_gtFine_polygons.json', '')) + with open(osp.join(out_dir, f'{split}.txt'), 'w') as f: + f.writelines(f + '\n' for f in filenames) + + +if __name__ == '__main__': + main() diff --git a/semantic_segmentation/tools/voc2010_to_pascalcontext.py b/semantic_segmentation/tools/voc2010_to_pascalcontext.py new file mode 100644 index 00000000..57581556 --- /dev/null +++ b/semantic_segmentation/tools/voc2010_to_pascalcontext.py @@ -0,0 +1,85 @@ +import argparse +import os.path as osp +from functools import partial + +import mmcv +import numpy as np +from detail import Detail +from PIL import Image + +_mapping = np.sort( + np.array([ + 0, 2, 259, 260, 415, 324, 9, 258, 144, 18, 19, 22, 23, 397, 25, 284, + 158, 159, 416, 33, 162, 420, 454, 295, 296, 427, 44, 45, 46, 308, 59, + 440, 445, 31, 232, 65, 354, 424, 68, 326, 72, 458, 34, 207, 80, 355, + 85, 347, 220, 349, 360, 98, 187, 104, 105, 366, 189, 368, 113, 115 + ])) +_key = np.array(range(len(_mapping))).astype('uint8') + + +def generate_labels(img_id, detail, out_dir): + + def _class_to_index(mask, _mapping, _key): + # assert the values + values = np.unique(mask) + for i in range(len(values)): + assert (values[i] in _mapping) + index = np.digitize(mask.ravel(), _mapping, right=True) + return _key[index].reshape(mask.shape) + + mask = Image.fromarray( + _class_to_index(detail.getMask(img_id), _mapping=_mapping, _key=_key)) + filename = 
img_id['file_name'] + mask.save(osp.join(out_dir, filename.replace('jpg', 'png'))) + return osp.splitext(osp.basename(filename))[0] + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert PASCAL VOC annotations to mmdetection format') + parser.add_argument('--devkit_path', default='./', help='pascal voc devkit path') + parser.add_argument('--json_path', default='./trainval_merged.json',help='annoation json filepath') + parser.add_argument('-o', '--out_dir', help='output path') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + devkit_path = args.devkit_path + if args.out_dir is None: + out_dir = osp.join(devkit_path, 'SegmentationClassContext') + else: + out_dir = args.out_dir + json_path = args.json_path + mmcv.mkdir_or_exist(out_dir) + img_dir = osp.join(devkit_path, 'JPEGImages') + + train_detail = Detail(json_path, img_dir, 'train') + train_ids = train_detail.getImgs() + + val_detail = Detail(json_path, img_dir, 'val') + val_ids = val_detail.getImgs() + + mmcv.mkdir_or_exist( + osp.join(devkit_path, 'ImageSets/SegmentationContext')) + + train_list = mmcv.track_progress( + partial(generate_labels, detail=train_detail, out_dir=out_dir), + train_ids) + with open( + osp.join(devkit_path, 'ImageSets/SegmentationContext', 'train.txt'), 'w') as f: + f.writelines(line + '\n' for line in sorted(train_list)) + + val_list = mmcv.track_progress( + partial(generate_labels, detail=val_detail, out_dir=out_dir), val_ids) + with open( + osp.join(devkit_path, 'ImageSets/SegmentationContext','val.txt'), 'w') as f: + f.writelines(line + '\n' for line in sorted(val_list)) + + print('Done!') + + +if __name__ == '__main__': + main() + diff --git a/semantic_segmentation/train.py b/semantic_segmentation/train.py new file mode 100644 index 00000000..ccb12959 --- /dev/null +++ b/semantic_segmentation/train.py @@ -0,0 +1,217 @@ +#!/usr/bin/python3 +import os +import time +import random +import argparse +import numpy as np +from collections import deque +import paddle +import paddle.nn as nn +from config import * +from src.utils import logger +from src.datasets import get_dataset +from src.models import get_model +from src.transforms import * +from src.utils import TimeAverager, calculate_eta, resume + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Visual Transformer for semantic segmentation') + parser.add_argument( + "--config", + dest='cfg', + default=None, + type=str, + help="The config file." 
+ ) + return parser.parse_args() + +def optimizer_setting(model, config): + if config.TRAIN.LR_SCHEDULER.NAME == "PolynomialDecay": + scheduler = paddle.optimizer.lr.PolynomialDecay( + learning_rate=config.TRAIN.BASE_LR, + decay_steps=config.TRAIN.ITERS, + end_lr=config.TRAIN.END_LR, + power=config.TRAIN.POWER, + cycle=False, + last_epoch=-1, + verbose=False) + else: + raise NotImplementedError( + f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + if config.TRAIN.OPTIMIZER.NAME == "SGD": + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM) + elif config.TRAIN.OPTIMIZER.NAME == "ADAM": + optimizer = paddle.optimizer.Adam( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + epsilon=config.TRAIN.OPTIMIZER.EPS, + weight_decay=config.TRAIN.WEIGHT_DECAY) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip) + else: + raise NotImplementedError( + f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + return optimizer + +def multi_cross_entropy_loss(pred_list, + label, + num_classes=60, + weights=[1, 0.4, 0.4, 0.4, 0.4]): + label = paddle.reshape(label, [-1, 1]) # (b, h, w) -> (bhw, 1) + label.stop_gradient = True + loss = 0 + for i in range(len(pred_list)): + pred_i = paddle.transpose(pred_list[i], perm=[0, 2, 3, 1]) # (b,c,h,w) -> (b,h,w,c) + pred_i = paddle.reshape(pred_i, [-1, num_classes]) # (b,h,w,c) -> (bhw, c) + pred_i = nn.functional.softmax(pred_i, axis=1) + loss_i = nn.functional.cross_entropy(pred_i, label, ignore_index=255) + loss += weights[i]*loss_i + return loss + +def main(): + config = get_config() + args = parse_args() + config = update_config(config, args) + place = 'gpu' if config.TRAIN.USE_GPU else 'cpu' + paddle.set_device(place) + # build model + model = get_model(config) + model.train() + nranks = paddle.distributed.ParallelEnv().nranks + local_rank = paddle.distributed.ParallelEnv().local_rank + # build optimizer + optimizer = optimizer_setting(model, config) + # build dataset_train + transforms_train = [ + ResizeStepScaling(min_scale_factor=0.5, + max_scale_factor=2.0, + scale_step_size=0.25), + RandomPaddingCrop(crop_size=config.DATA.CROP_SIZE, + img_padding_value=(123.675, 116.28, 103.53), + label_padding_value=255), + RandomHorizontalFlip(prob=0.5), + RandomDistort(brightness_range=0.4, + contrast_range=0.4, + saturation_range=0.4), + Normalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]) + ] + dataset_train = get_dataset(config, data_transform=transforms_train, mode='train') + batch_sampler = paddle.io.DistributedBatchSampler( + dataset_train, + batch_size=config.DATA.BATCH_SIZE, + shuffle=True, drop_last=True) + train_loader = paddle.io.DataLoader( + dataset_train, + batch_sampler=batch_sampler, + num_workers=config.DATA.NUM_WORKERS, + return_list=True, + ) + logger.info("train_loader.len= {}".format(len(train_loader))) + start_iter = 0 + # 
TODO(wutianyiRosun@gmail.com): Resume from checkpoints, and update start_iter + + # build workspace for saving checkpoints + if not os.path.isdir(config.SAVE_DIR): + if os.path.exists(config.SAVE_DIR): + os.remove(config.SAVE_DIR) + os.makedirs(config.SAVE_DIR) + if nranks > 1: + # Initialize parallel environment if not done. + if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(): + logger.info("using dist training") + paddle.distributed.init_parallel_env() + ddp_model = paddle.DataParallel(model) + else: + ddp_model = paddle.DataParallel(model) + avg_loss = 0.0 + avg_loss_list = [] + iters_per_epoch = len(batch_sampler) + reader_cost_averager = TimeAverager() + batch_cost_averager = TimeAverager() + save_models = deque() + batch_start = time.time() + cur_iter = start_iter + # begin training + while cur_iter < config.TRAIN.ITERS: + for data in train_loader: + cur_iter += 1 + if cur_iter > config.TRAIN.ITERS: + break + reader_cost_averager.record(time.time() - batch_start) + images = data[0] + labels = data[1].astype('int64') + if nranks > 1: + logits_list = ddp_model(images) + else: + logits_list = model(images) + loss_list = multi_cross_entropy_loss(logits_list, labels, num_classes=config.DATA.NUM_CLASSES) + loss = sum(loss_list) + loss.backward() + optimizer.step() + lr = optimizer.get_lr() + if isinstance(optimizer._learning_rate,paddle.optimizer.lr.LRScheduler): + optimizer._learning_rate.step() + model.clear_gradients() + avg_loss += loss.numpy()[0] + if not avg_loss_list: + avg_loss_list = [l.numpy() for l in loss_list] + else: + for i in range(len(loss_list)): + avg_loss_list[i] += loss_list[i].numpy() + batch_cost_averager.record( + time.time() - batch_start, num_samples=config.DATA.BATCH_SIZE) + if (cur_iter) % config.LOGGING_INFO_FREQ == 0 and local_rank == 0: + avg_loss /= config.LOGGING_INFO_FREQ + avg_loss_list = [l[0] / config.LOGGING_INFO_FREQ for l in avg_loss_list] + remain_iters = config.TRAIN.ITERS - cur_iter + avg_train_batch_cost = batch_cost_averager.get_average() + avg_train_reader_cost = reader_cost_averager.get_average() + eta = calculate_eta(remain_iters, avg_train_batch_cost) + logger.info("[TRAIN] epoch: {}, iter: {}/{}, loss: {:.4f}, lr: {:.8f}, batch_cost:\ + {:.4f}, reader_cost: {:.5f}, ips: {:.4f} samples/sec | ETA {}".format( + (cur_iter - 1) // iters_per_epoch + 1, cur_iter, config.TRAIN.ITERS, avg_loss, + lr, avg_train_batch_cost, avg_train_reader_cost, + batch_cost_averager.get_ips_average(), eta)) + avg_loss = 0.0 + avg_loss_list = [] + reader_cost_averager.reset() + batch_cost_averager.reset() + + if (cur_iter % config.SAVE_FREQ_CHECKPOINT == 0 or cur_iter == config.TRAIN.ITERS) and local_rank == 0: + current_save_weigth_file = os.path.join(config.SAVE_DIR, + "iter_{}_model_state.pdparams".format(cur_iter)) + current_save_opt_file = os.path.join(config.SAVE_DIR, + "iter_{}_opt_state.pdopt".format(cur_iter)) + paddle.save(model.state_dict(), current_save_weigth_file) + paddle.save(optimizer.state_dict(), current_save_opt_file) + save_models.append([current_save_weigth_file, + current_save_opt_file]) + logger.info("saving the weights of model to {}".format( + current_save_weigth_file)) + if len(save_models) > config.KEEP_CHECKPOINT_MAX > 0: + files_to_remove = save_models.popleft() + os.remove(files_to_remove[0]) + os.remove(files_to_remove[1]) + batch_start = time.time() + time.sleep(1.0) + +if __name__ == '__main__': + main() diff --git a/semantic_segmentation/tutorial/custom_dataset.md 
b/semantic_segmentation/tutorial/custom_dataset.md
new file mode 100644
index 00000000..60b71c47
--- /dev/null
+++ b/semantic_segmentation/tutorial/custom_dataset.md
@@ -0,0 +1,57 @@
+# Custom Dataset
+
+To train on your own dataset, follow the three steps below.
+
+## Data Preparation
+
+### Data Structure
+
+For simplicity, reorganize your dataset as follows.
+
+```none
+├── your_dataset
+│   ├── images
+│   │   ├── training
+│   │   │   ├── xxx{img_suffix}
+│   │   │   ├── yyy{img_suffix}
+│   │   │   ├── zzz{img_suffix}
+│   │   ├── validation
+│   ├── annotations
+│   │   ├── training
+│   │   │   ├── xxx{seg_map_suffix}
+│   │   │   ├── yyy{seg_map_suffix}
+│   │   │   ├── zzz{seg_map_suffix}
+│   │   ├── validation
+
+```
+
+Images and labels are stored separately and split into training and validation sets. The four directory paths above are specified in the dataset script relative to the dataset root path, so you can rename them as you like.
+
+### Annotation Format
+
+Only gray-scale label images are supported for now, so convert your label images to gray-scale first if needed.
+
+Each pixel value is the class index, and you can set an `ignore_index` that is excluded from the metric computation.
+
+## Write A New Script
+
+1. Copy an existing dataset script in src/datasets and replace the original dataset name with your dataset name.
+2. Change the default value of the num_classes parameter.
+3. Override the init function to set the required attributes, especially self.file_list, which is a list of image/label path pairs.
+
+   ADE20K scans **all files** in the image_dir and replaces img_suffix with seg_map_suffix in each filename.
+
+   Cityscapes scans files with the given suffix and pairs images with labels in sorted order.
+
+   You can refer to either of the two methods above to build the image/label correspondence for your dataset. Both methods require setting img_suffix and seg_map_suffix in the correct place.
+
+4. In the __init__.py, add your own dataset to the if-elif structure in the get_dataset function.
+
+## Create Yaml Config
+
+1. Copy an existing yaml config.
+
+2. Change the following parameters in DATA: DATASET, DATA_PATH, NUM_CLASSES.
+
+3. Change other parameters as needed. Before doing so, it helps to understand the meaning of each parameter in the config.
+
+
+
+Now you can use the training command to train on your own dataset.
\ No newline at end of file
diff --git a/semantic_segmentation/val.py b/semantic_segmentation/val.py
new file mode 100644
index 00000000..33b29451
--- /dev/null
+++ b/semantic_segmentation/val.py
@@ -0,0 +1,164 @@
+#!/usr/bin/python3
+import time
+import shutil
+import random
+import argparse
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from config import *
+from src.api import infer
+from src.datasets import get_dataset
+from src.transforms import Resize, Normalize
+from src.models import get_model
+from src.utils import metrics, logger, progbar
+from src.utils import TimeAverager, calculate_eta
+from src.utils import load_entire_model, resume
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Evaluation of Seg. Models')
+    parser.add_argument(
+        "--config",
+        dest='cfg',
+        default=None,
+        type=str,
+        help='The config file.'
+ ) + parser.add_argument( + '--model_path', + dest='model_path', + help='The path of weights file (segmentation model)', + type=str, + default=None + ) + parser.add_argument( + "--multi_scales", + type=bool, + default=False, + help='whether employing multiple scales testing' + ) + return parser.parse_args() + +if __name__ == '__main__': + config = get_config() + args = parse_args() + config = update_config(config, args) + if args.model_path is None: + args.model_path = os.path.join(config.SAVE_DIR, + "iter_{}_model_state.pdparams".format(config.TRAIN.ITERS)) + place = 'gpu' if config.VAL.USE_GPU else 'cpu' + paddle.set_device(place) + # build model + model = get_model(config) + if args.model_path: + load_entire_model(model, args.model_path) + logger.info('Loaded trained params of model successfully') + model.eval() + + nranks = paddle.distributed.ParallelEnv().nranks + local_rank = paddle.distributed.ParallelEnv().local_rank + if nranks > 1: + # Initialize parallel environment if not done. + if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(): + paddle.distributed.init_parallel_env() + ddp_model = paddle.DataParallel(model) + else: + ddp_model = paddle.DataParallel(model) + # build val dataset and dataloader + transforms_val = [ Resize(target_size=config.VAL.IMAGE_BASE_SIZE, + keep_ori_size=config.VAL.KEEP_ORI_SIZE), + Normalize(mean=config.VAL.MEAN, std=config.VAL.STD)] + dataset_val = get_dataset(config, data_transform=transforms_val, mode='val') + batch_sampler = paddle.io.DistributedBatchSampler(dataset_val, + batch_size=config.DATA.BATCH_SIZE_VAL, shuffle=True, drop_last=True) + loader_val = paddle.io.DataLoader(dataset_val, batch_sampler=batch_sampler, + num_workers=config.DATA.NUM_WORKERS, return_list=True) + total_iters = len(loader_val) + # build workspace for saving checkpoints + if not os.path.isdir(config.SAVE_DIR): + if os.path.exists(config.SAVE_DIR): + os.remove(config.SAVE_DIR) + os.makedirs(config.SAVE_DIR) + intersect_area_all = 0 + pred_area_all = 0 + label_area_all = 0 + logger.info("Start evaluating (total_samples: {}, total_iters: {}, " + "multi-scale testing: {})".format(len(dataset_val), total_iters, args.multi_scales)) + progbar_val = progbar.Progbar(target=total_iters, verbose=1) + reader_cost_averager = TimeAverager() + batch_cost_averager = TimeAverager() + batch_start = time.time() + with paddle.no_grad(): + for iter, (img, label) in enumerate(loader_val): + reader_cost_averager.record(time.time() - batch_start) + label = label.astype('int64') + #print("img.shape: {}, label.shape: {}".format(img.shape, label.shape)) + ori_shape = label.shape[-2:] + if args.multi_scales == True: + pred = infer.ms_inference( + model=model, + img=img, + ori_shape=ori_shape, + is_slide=True, + base_size=config.VAL.IMAGE_BASE_SIZE, + stride_size=config.VAL.STRIDE_SIZE, + crop_size=config.VAL.CROP_SIZE, + num_classes=config.DATA.NUM_CLASSES, + scales=config.VAL.SCALE_RATIOS, + flip_horizontal=True, + flip_vertical=False, + rescale_from_ori=config.VAL.RESCALE_FROM_ORI) + else: + pred = infer.ss_inference( + model=model, + img=img, + ori_shape=ori_shape, + is_slide=True, + base_size=config.VAL.IMAGE_BASE_SIZE, + stride_size=config.VAL.STRIDE_SIZE, + crop_size=config.VAL.CROP_SIZE, + num_classes=config.DATA.NUM_CLASSES, + rescale_from_ori=config.VAL.RESCALE_FROM_ORI) + + intersect_area, pred_area, label_area = metrics.calculate_area( + pred, + label, + dataset_val.num_classes, + ignore_index=dataset_val.ignore_index) + # Gather from all ranks + if nranks > 
1: + intersect_area_list = [] + pred_area_list = [] + label_area_list = [] + paddle.distributed.all_gather(intersect_area_list, intersect_area) + paddle.distributed.all_gather(pred_area_list, pred_area) + paddle.distributed.all_gather(label_area_list, label_area) + # Some image has been evaluated and should be eliminated in last iter + if (iter + 1) * nranks > len(dataset_val): + valid = len(dataset_val) - iter * nranks + intersect_area_list = intersect_area_list[:valid] + pred_area_list = pred_area_list[:valid] + label_area_list = label_area_list[:valid] + for i in range(len(intersect_area_list)): + intersect_area_all = intersect_area_all + intersect_area_list[i] + pred_area_all = pred_area_all + pred_area_list[i] + label_area_all = label_area_all + label_area_list[i] + else: + intersect_area_all = intersect_area_all + intersect_area + pred_area_all = pred_area_all + pred_area + label_area_all = label_area_all + label_area + batch_cost_averager.record(time.time() - batch_start, num_samples=len(label)) + batch_cost = batch_cost_averager.get_average() + reader_cost = reader_cost_averager.get_average() + if local_rank == 0 : + progbar_val.update(iter + 1, [('batch_cost', batch_cost), ('reader cost', reader_cost)]) + reader_cost_averager.reset() + batch_cost_averager.reset() + batch_start = time.time() + class_iou, miou = metrics.mean_iou(intersect_area_all, pred_area_all, label_area_all) + class_acc, acc = metrics.accuracy(intersect_area_all, pred_area_all) + kappa = metrics.kappa(intersect_area_all, pred_area_all, label_area_all) + logger.info("[EVAL] #Images: {} mIoU: {:.4f} Acc: {:.4f} Kappa: {:.4f} ".format(len(dataset_val), miou, acc, kappa)) + logger.info("[EVAL] Class IoU: \n" + str(np.round(class_iou, 4))) + logger.info("[EVAL] Class Acc: \n" + str(np.round(class_acc, 4)))