From 603e0b8a6835aa22d9e92990c38307dc3c2f524d Mon Sep 17 00:00:00 2001 From: ceci3 Date: Tue, 19 Dec 2023 18:30:32 +0800 Subject: [PATCH 1/4] support nf4 channel wise quant & fix bug when blocksize>512 (#1817) --- csrc/lc/dequantize_blockwise.cu | 84 ++++++++++++++++++++--- csrc/lc/quantize_blockwise.cu | 115 ++++++++++++++++++++++++-------- 2 files changed, 162 insertions(+), 37 deletions(-) diff --git a/csrc/lc/dequantize_blockwise.cu b/csrc/lc/dequantize_blockwise.cu index 8046c34ac..0bf76a163 100644 --- a/csrc/lc/dequantize_blockwise.cu +++ b/csrc/lc/dequantize_blockwise.cu @@ -201,7 +201,6 @@ template __global__ void kDequantizeBlockwise(const floa //template __global__ void kDequantizeBlockwise<__nv_bfloat16, 512, 64, 8, NF4>(const float *code, const unsigned char * A, const float * absmax, __nv_bfloat16 *out, int blocksize, int n); - template void dequantize_blockwise(const float *code, const unsigned char *A, const float *absmax, T *out, int blocksize, int n) { int num_blocks = n/blocksize; @@ -226,6 +225,50 @@ template void dequantize_blockwise(const float *code, const unsigned //template void dequantize_blockwise<__nv_bfloat16, FP4>(const float *code, const unsigned char *A, const float *absmax, __nv_bfloat16 *out, int blocksize, int n); //template void dequantize_blockwise<__nv_bfloat16, NF4>(const float *code, const unsigned char *A, const float *absmax, __nv_bfloat16 *out, int blocksize, int n); +template +__global__ void kDequantizeChannelwise(const unsigned char* A, + const float *absmax, + float *out, + int n, + int cout) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + + int num = n / 2; + //int part_n = num / cout; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + float local_absmax = absmax[i%cout]; + int idx = 2*(i/cout)* cout + i%cout; + switch(DATA_TYPE) + { + case FP4: + out[i*2 + i%cout] = dDequantizeFP4Tree(A[i] >> 4, local_absmax); + out[i*2 + cout + i%cout] = dDequantizeFP4Tree(A[i] & 0x0F, local_absmax); + break; + case NF4: + out[idx] = dDequantizeNF4(A[i] >> 4)* local_absmax; + out[idx + cout] = dDequantizeNF4(A[i] & 0x0F)* local_absmax; + break; + } + __syncthreads(); + } +} + +template void dequantize_channelwise(const unsigned char *A, const float *absmax, T *out, int n, int cout) +{ + int max_threads = 1024; + int64_t block_size = + std::min(static_cast(n), + static_cast(max_threads/ 4)); + + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (n + block_size - 1) / block_size); + + kDequantizeChannelwise<<>>(A, absmax, out, n, cout); + CUDA_CHECK_RETURN(cudaPeekAtLastError()); +} + std::vector DequantizeBlockwise(const paddle::Tensor& input, const paddle::Tensor& code, const paddle::Tensor& absmax, int blocksize, std::string quant_type) { int64_t input_numel = input.numel(); int n = input_numel; @@ -234,23 +277,44 @@ std::vector DequantizeBlockwise(const paddle::Tensor& input, con out_shape = {input_numel * 2, 1}; n = n * 2; } + if (blocksize == -1) { + out_shape = {input.shape()[0] * 2, input.shape()[1]}; + } auto out = paddle::empty(out_shape, paddle::DataType::FLOAT32, input.place()); - if (quant_type == "8bit") - dequantize_blockwise(code.data(), input.data(), absmax.data(), out.data(), blocksize, n); - else if (quant_type == "nf4") - dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); - else if (quant_type == "fp4") - dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); - else - 
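// NOTE: sketch of the channel-wise 4-bit layout, as read from the NF4 branch of
// kDequantizeChannelwise. Each byte of A packs two codes for the same output column:
// packed element i expands to
//   idx        = 2*(i/cout)*cout + i%cout   // row 2*(i/cout),   column i%cout  (high nibble)
//   idx + cout                              // row 2*(i/cout)+1, column i%cout  (low nibble)
// and absmax holds one scale per output column, so local_absmax = absmax[i % cout].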
PD_THROW("NOT supported quant type. Only 8bit, nf4, fp4 are supported. "); + if (blocksize == -1) { + if (quant_type == "8bit") + PD_THROW("blocksize is -1 only support NF4 and FP4."); + else + blocksize = n / absmax.numel() * 2; + + int cout = input.shape()[1]; + if (quant_type == "nf4") + dequantize_channelwise(input.data(), absmax.data(), out.data(), n, cout); + else if (quant_type == "fp4") + dequantize_channelwise(input.data(), absmax.data(), out.data(), n, cout); + else + PD_THROW("NOT supported quant type. Only 8bit, nf4, fp4 are supported. "); + } else { + if (quant_type == "8bit") + dequantize_blockwise(code.data(), input.data(), absmax.data(), out.data(), blocksize, n); + else if (quant_type == "nf4") + dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); + else if (quant_type == "fp4") + dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); + else + PD_THROW("NOT supported quant type. Only 8bit, nf4, fp4 are supported. "); + } return {out}; }; std::vector> GetDequantizeBlockwiseInferShape(const std::vector& input_shape, const std::vector& code_shape, const std::vector& abs_max_shape, int blocksize, std::string quant_type){ int64_t first_shape = input_shape[0] * input_shape[1] * 2; if (quant_type != "8bit") - return {{first_shape, 1}}; + if (blocksize != -1) + return {{first_shape, 1}}; + else + return {{input_shape[0] * 2, input_shape[1]}}; else return {input_shape}; } diff --git a/csrc/lc/quantize_blockwise.cu b/csrc/lc/quantize_blockwise.cu index d4f6ff2ca..e8e55b9d8 100644 --- a/csrc/lc/quantize_blockwise.cu +++ b/csrc/lc/quantize_blockwise.cu @@ -279,6 +279,7 @@ __global__ void kQuantizeBlockwise(const float * code, const T * __restrict__ A, #pragma unroll NUM_PER_TH for(int j = 0; j < NUM_PER_TH/2; j++) { + packed_4bit = 0; packed_4bit |= dQuantizeNF4(((float)vals[2*j])*local_abs_max) << 4; packed_4bit |= dQuantizeNF4(((float)vals[2*j+1])*local_abs_max); qvals[j] = packed_4bit; @@ -360,9 +361,39 @@ MAKE_kQuantizeBlockwise(__nv_bfloat16, 256, 2, NF4) MAKE_kQuantizeBlockwise(__nv_bfloat16, 128, 2, NF4) MAKE_kQuantizeBlockwise(__nv_bfloat16, 64, 2, NF4) +template +__global__ void kQuantizeChannelwise(const float *code, + const T* A, + unsigned char* out, + float *absmax, + int n, + int cout) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + + int num = n / 2; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + int idx = 2*(i/cout)* cout + i%cout; + float local_absmax = absmax[i %cout]; + float inv_local_absmax = 1.0f/local_absmax; + unsigned char packed_4bit = 0; + switch(DATA_TYPE) + { + case FP4: + packed_4bit |= dQuantizeFP4(((float)A[idx])*inv_local_absmax) << 4; + packed_4bit |= dQuantizeFP4(((float)A[idx+cout])*inv_local_absmax); + out[i] = packed_4bit; + break; + case NF4: + packed_4bit |= dQuantizeNF4(((float)A[idx])*inv_local_absmax) << 4; + packed_4bit |= dQuantizeNF4(((float)A[idx+cout])*inv_local_absmax); + out[i] = packed_4bit; + break; + } + } +} -template void quantize_blockwise(const float *code, const paddle::Tensor& A, float *absmax, unsigned char *out, int blocksize, int n) +template void quantize_blockwise(const float *code, const paddle::Tensor& A, paddle::Tensor& absmax, unsigned char *out, int blocksize, int n, int channelwise) { typedef PDTraits traits_; typedef typename traits_::DataType DataType_; @@ -372,22 +403,43 @@ template void quantize_blockwise(const float num_blocks = n % blocksize == 0 ? 
num_blocks : num_blocks + 1; const DataType_* A_data = reinterpret_cast(A.data()); - if(blocksize == 4096) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 2048) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 1024) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 512) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 256) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 128) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 64) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else - PD_THROW("only support blocksize is [64, 128, 256, 512, 1024, 2048, 4096]."); + if (channelwise == 0) { + if(blocksize == 4096) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 2048) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 1024) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 512) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 256) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 128) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 64) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + } + else { + if (DATA_TYPE == General8bit) + PD_THROW("blocksize is -1 only support NF4 and FP4."); + + int cout = A.shape()[1]; + int max_threads = 1024; + + absmax = A.abs().max({0}); + + int64_t block_size = + std::min(static_cast(n), + static_cast(max_threads/ 4)); + + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (n + block_size - 1) / block_size); + + kQuantizeChannelwise<<>>( + code, A_data, out, absmax.data(), n, cout); + } CUDA_CHECK_RETURN(cudaPeekAtLastError()); @@ -395,38 +447,44 @@ template void quantize_blockwise(const float std::vector QuantizeBlockwise(const paddle::Tensor& input, const paddle::Tensor& code, int blocksize, std::string quant_type) { int n = input.numel(); + int channelwise = 0; std::vector out_shape = input.shape(); if (quant_type != "8bit") { // 4bit out_shape = {(n + 1) / 2, 1}; } + if (blocksize == -1){ + blocksize = input.shape()[0]; + out_shape = {input.shape()[0]/2, input.shape()[1]}; + channelwise = 1; + } auto out = paddle::empty(out_shape, paddle::DataType::UINT8, input.place()); int64_t absmax_shape = n / blocksize; auto absmax = paddle::empty({absmax_shape}, paddle::DataType::FLOAT32, input.place()); switch(input.type()) { case paddle::DataType::FLOAT32: if (quant_type == "8bit") - quantize_blockwise(code.data(), input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(code.data(), input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "nf4") { - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); } else if (quant_type == "fp4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); return {out, absmax}; case paddle::DataType::FLOAT16: if (quant_type == "8bit") - quantize_blockwise(code.data(), input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(code.data(), input, absmax, out.data(), blocksize, n, 
channelwise); else if (quant_type == "nf4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "fp4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); return {out, absmax}; case paddle::DataType::BFLOAT16: if (quant_type == "8bit") - quantize_blockwise(code.data(), input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(code.data(), input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "nf4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "fp4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); return {out, absmax}; default: @@ -440,7 +498,10 @@ std::vector QuantizeBlockwise(const paddle::Tensor& input, const std::vector> GetQuantizeBlockwiseInferShape(const std::vector& input_shape, const std::vector& code_shape, int blocksize, std::string quant_type){ int64_t first_shape = (input_shape[0] * input_shape[1] + 1) / 2; if (quant_type != "8bit") - return {{first_shape, 1}}; + if (blocksize != -1) + return {{first_shape, 1}}; + else + return {{input_shape[0]/2, input_shape[1]}}; else return {input_shape}; } From a6323eec219fcdc92bf424acd1927729a614ba4f Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Wed, 3 Jan 2024 10:51:20 +0000 Subject: [PATCH 2/4] =?UTF-8?q?=E7=9B=AE=E6=A0=87=E6=A3=80=E6=B5=8B?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E8=87=AA=E5=8A=A8=E5=8E=8B=E7=BC=A9=E7=A4=BA?= =?UTF-8?q?=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../detection/configs/picodet_reader.yml | 4 +-- .../detection/configs/ppyoloe_l_qat_dis.yaml | 5 ++- .../detection/paddle_inference_eval.py | 33 ++++++++++++++----- .../detection/post_process.py | 14 ++++---- 4 files changed, 34 insertions(+), 22 deletions(-) diff --git a/example/auto_compression/detection/configs/picodet_reader.yml b/example/auto_compression/detection/configs/picodet_reader.yml index 389673367..cd7ba8029 100644 --- a/example/auto_compression/detection/configs/picodet_reader.yml +++ b/example/auto_compression/detection/configs/picodet_reader.yml @@ -6,13 +6,13 @@ TrainDataset: !COCODataSet image_dir: train2017 anno_path: annotations/instances_train2017.json - dataset_dir: dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ EvalDataset: !COCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json - dataset_dir: dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ eval_height: &eval_height 416 eval_width: &eval_width 416 diff --git a/example/auto_compression/detection/configs/ppyoloe_l_qat_dis.yaml b/example/auto_compression/detection/configs/ppyoloe_l_qat_dis.yaml index 7102142ed..248a5b8e2 100644 --- a/example/auto_compression/detection/configs/ppyoloe_l_qat_dis.yaml +++ b/example/auto_compression/detection/configs/ppyoloe_l_qat_dis.yaml @@ -2,7 +2,7 @@ Global: reader_config: configs/yolo_reader.yml arch: PPYOLOE - include_nms: True + include_nms: False Evaluation: True model_dir: ./ppyoloe_crn_l_300e_coco model_filename: model.pdmodel @@ -30,5 +30,4 @@ TrainConfig: optimizer_builder: optimizer: 
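      # NOTE: with include_nms switched to False above, the exported PPYOLOE model
      # presumably no longer carries the NMS op, so boxes must come from the
      # evaluation script's own post-processing (see post_process.py and the
      # --include_nms flag in paddle_inference_eval.py in this series).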
type: SGD - weight_decay: 4.0e-05 - + weight_decay: 4.0e-05 \ No newline at end of file diff --git a/example/auto_compression/detection/paddle_inference_eval.py b/example/auto_compression/detection/paddle_inference_eval.py index d2e12afd1..b0368bffb 100644 --- a/example/auto_compression/detection/paddle_inference_eval.py +++ b/example/auto_compression/detection/paddle_inference_eval.py @@ -18,6 +18,7 @@ import sys import cv2 import numpy as np +from tqdm import tqdm import paddle from paddle.inference import Config @@ -82,9 +83,15 @@ def argsparser(): parser.add_argument("--img_shape", type=int, default=640, help="input_size") parser.add_argument( '--include_nms', - type=bool, - default=True, + type=str, + default='True', help="Whether include nms or not.") + parser.add_argument( + "--trt_calib_mode", + type=bool, + default=False, + help="If the model is produced by TRT offline quantitative " + "calibration, trt_calib_mode need to set True.") return parser @@ -208,8 +215,9 @@ def load_predictor( use_mkldnn=False, batch_size=1, device="CPU", - min_subgraph_size=3, + min_subgraph_size=4, use_dynamic_shape=False, + trt_calib_mode=False, trt_min_shape=1, trt_max_shape=1280, trt_opt_shape=640, @@ -238,9 +246,11 @@ def load_predictor( config = Config( os.path.join(model_dir, "model.pdmodel"), os.path.join(model_dir, "model.pdiparams")) + + config.enable_memory_optim() if device == "GPU": # initial GPU memory(M), device ID - config.enable_use_gpu(200, 0) + config.enable_use_gpu(1000, 0) # optimize graph and fuse op config.switch_ir_optim(True) else: @@ -260,12 +270,12 @@ def load_predictor( } if precision in precision_map.keys() and use_trt: config.enable_tensorrt_engine( - workspace_size=(1 << 25) * batch_size, + workspace_size=(1 << 30) * batch_size, max_batch_size=batch_size, min_subgraph_size=min_subgraph_size, precision_mode=precision_map[precision], use_static=True, - use_calib_mode=False, ) + use_calib_mode=False) if use_dynamic_shape: dynamic_shape_file = os.path.join(FLAGS.model_path, @@ -297,6 +307,7 @@ def predict_image(predictor, img, scale_factor = image_preprocess(image_file, image_shape) inputs = {} inputs["image"] = img + if FLAGS.include_nms: inputs['scale_factor'] = scale_factor input_names = predictor.get_input_names() @@ -356,7 +367,8 @@ def eval(predictor, val_loader, metric, rerun_flag=False): boxes_tensor = predictor.get_output_handle(output_names[0]) if FLAGS.include_nms: boxes_num = predictor.get_output_handle(output_names[1]) - for batch_id, data in enumerate(val_loader): + for batch_id, data in tqdm( + enumerate(val_loader), total=len(val_loader), desc='Evaluating'): data_all = {k: np.array(v) for k, v in data.items()} for i, _ in enumerate(input_names): input_tensor = predictor.get_input_handle(input_names[i]) @@ -382,7 +394,6 @@ def eval(predictor, val_loader, metric, rerun_flag=False): res = {'bbox': np_boxes, 'bbox_num': np_boxes_num} metric.update(data_all, res) if batch_id % 100 == 0: - print("Eval iter:", batch_id) sys.stdout.flush() metric.accumulate() metric.log() @@ -421,7 +432,6 @@ def main(): repeats=repeats) else: reader_cfg = load_config(FLAGS.reader_config) - dataset = reader_cfg["EvalDataset"] global val_loader val_loader = create("EvalReader")( @@ -432,6 +442,7 @@ def main(): anno_file = dataset.get_anno() metric = COCOMetric( anno_file=anno_file, clsid2catid=clsid2catid, IouType="bbox") + eval(predictor, val_loader, metric, rerun_flag=rerun_flag) if rerun_flag: @@ -444,6 +455,10 @@ def main(): paddle.enable_static() parser = argsparser() FLAGS = 
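    # NOTE: --include_nms is declared with type=str above because argparse's
    # type=bool treats any non-empty string (including "False") as True; the flag
    # is converted back to a real bool by the string comparison below.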
parser.parse_args() + if FLAGS.include_nms == 'True': + FLAGS.include_nms = True + else: + FLAGS.include_nms = False # DataLoader need run on cpu paddle.set_device("cpu") diff --git a/example/auto_compression/detection/post_process.py b/example/auto_compression/detection/post_process.py index eea2f0195..4ed79ce73 100644 --- a/example/auto_compression/detection/post_process.py +++ b/example/auto_compression/detection/post_process.py @@ -41,8 +41,7 @@ def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200): rest_boxes = boxes[indexes, :] iou = iou_of( rest_boxes, - np.expand_dims( - current_box, axis=0), ) + np.expand_dims(current_box, axis=0), ) indexes = indexes[iou <= iou_threshold] return box_scores[picked, :] @@ -122,7 +121,7 @@ def _non_max_suppression(self, prediction, scale_factor): picked_labels.extend([class_index] * box_probs.shape[0]) if len(picked_box_probs) == 0: - out_boxes_list.append(np.empty((0, 4))) + out_boxes_list.append(np.empty((0, 6))) else: picked_box_probs = np.concatenate(picked_box_probs) @@ -135,9 +134,8 @@ def _non_max_suppression(self, prediction, scale_factor): # clas score box out_box = np.concatenate( [ - np.expand_dims( - np.array(picked_labels), axis=-1), np.expand_dims( - picked_box_probs[:, 4], axis=-1), + np.expand_dims(np.array(picked_labels), axis=-1), + np.expand_dims(picked_box_probs[:, 4], axis=-1), picked_box_probs[:, :4] ], axis=1) @@ -152,6 +150,6 @@ def _non_max_suppression(self, prediction, scale_factor): return out_boxes_list, box_num_list def __call__(self, outs, scale_factor): - out_boxes_list, box_num_list = self._non_max_suppression(outs, - scale_factor) + out_boxes_list, box_num_list = self._non_max_suppression( + outs, scale_factor) return {'bbox': out_boxes_list, 'bbox_num': box_num_list} From 132c5b6f5f304b07c9f2fbdcbeebf1b8c3c19ff8 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 8 Jan 2024 09:30:42 +0000 Subject: [PATCH 3/4] =?UTF-8?q?paddle=5Finference.py=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../paddle_inference_eval.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/example/auto_compression/pytorch_yolo_series/paddle_inference_eval.py b/example/auto_compression/pytorch_yolo_series/paddle_inference_eval.py index a1df31b78..ea5cb975d 100644 --- a/example/auto_compression/pytorch_yolo_series/paddle_inference_eval.py +++ b/example/auto_compression/pytorch_yolo_series/paddle_inference_eval.py @@ -79,7 +79,8 @@ def argsparser(): "--device", type=str, default="GPU", - help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is GPU", + help= + "Choose the device you want to run, it can be: CPU/GPU/XPU, default is GPU", ) parser.add_argument( "--arch", type=str, default="YOLOv5", help="architectures name.") @@ -180,8 +181,9 @@ def draw_box(img, boxes, scores, cls_ids, conf=0.5, class_names=None): txt_size = cv2.getTextSize(text, font, 0.4, 1)[0] cv2.rectangle(img, (x0, y0), (x1, y1), color, 2) - cv2.rectangle(img, (x0, y0 + 1), ( - x0 + txt_size[0] + 1, y0 + int(1.5 * txt_size[1])), color, -1) + cv2.rectangle(img, (x0, y0 + 1), (x0 + txt_size[0] + 1, + y0 + int(1.5 * txt_size[1])), color, + -1) cv2.putText( img, text, (x0, y0 + txt_size[1]), @@ -288,8 +290,8 @@ def load_predictor( dynamic_shape_file = os.path.join(FLAGS.model_path, "dynamic_shape.txt") if os.path.exists(dynamic_shape_file): - config.enable_tuned_tensorrt_dynamic_shape(dynamic_shape_file, - True) + 
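                # NOTE: two-pass dynamic-shape workflow used by this script: the first
                # run calls collect_shape_range_info() to record shapes into
                # dynamic_shape.txt, and later runs load that file here via
                # enable_tuned_tensorrt_dynamic_shape().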
config.enable_tuned_tensorrt_dynamic_shape( + dynamic_shape_file, True) print("trt set dynamic shape done!") else: config.collect_shape_range_info(dynamic_shape_file) @@ -315,7 +317,8 @@ def eval(predictor, val_loader, anno_file, rerun_flag=False): input_names = predictor.get_input_names() output_names = predictor.get_output_names() boxes_tensor = predictor.get_output_handle(output_names[0]) - for batch_id, data in enumerate(val_loader): + for batch_id, data in tqdm( + enumerate(val_loader), total=len(val_loader), desc='Evaluating'): data_all = {k: np.array(v) for k, v in data.items()} inputs = {} if FLAGS.arch == "YOLOv6": @@ -345,7 +348,7 @@ def eval(predictor, val_loader, anno_file, rerun_flag=False): cpu_mems += cpu_mem gpu_mems += gpu_mem if batch_id % 100 == 0: - print("Eval iter:", batch_id) + # print("Eval iter:", batch_id) sys.stdout.flush() print("[Benchmark]Avg cpu_mem:{} MB, avg gpu_mem: {} MB".format( cpu_mems / sample_nums, gpu_mems / sample_nums)) From ba43c20918338f01305cd1aa75239fbea21bdcca Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 15 Jan 2024 09:26:11 +0000 Subject: [PATCH 4/4] aa --- .../detection/configs/ppyoloe_s_qat_dis.yaml | 54 +++++++++------ .../detection/configs/yolo_reader.yml | 4 +- .../nlp/configs/pp-minilm/auto/afqmc.yaml | 26 +++++-- .../nlp/configs/uie/uie_base.yaml | 37 +++++----- example/auto_compression/nlp/run.py | 2 + .../detection/configs/picodet_s_analysis.yaml | 16 ++--- .../configs/picodet_s_analyzed_ptq.yaml | 4 +- .../detection/configs/picodet_s_ptq.yaml | 8 +-- .../detection/configs/ppyoloe_s_ptq.yaml | 8 +-- .../detection/eval.py | 15 ++-- .../pytorch_yolo_series/README.md | 69 ++++++++++++++++++- setup.py | 23 ++++--- 12 files changed, 183 insertions(+), 83 deletions(-) diff --git a/example/auto_compression/detection/configs/ppyoloe_s_qat_dis.yaml b/example/auto_compression/detection/configs/ppyoloe_s_qat_dis.yaml index 3f6ade72b..60edb8db8 100644 --- a/example/auto_compression/detection/configs/ppyoloe_s_qat_dis.yaml +++ b/example/auto_compression/detection/configs/ppyoloe_s_qat_dis.yaml @@ -8,27 +8,39 @@ Global: model_filename: model.pdmodel params_filename: model.pdiparams -Distillation: - alpha: 1.0 - loss: soft_label +# Distillation: +# alpha: 1.0 +# loss: soft_label -QuantAware: - onnx_format: true - use_pact: true - activation_quantize_type: 'moving_average_abs_max' - quantize_op_types: - - conv2d - - depthwise_conv2d +# QuantAware: +# onnx_format: true +# use_pact: true +# activation_quantize_type: 'moving_average_abs_max' +# quantize_op_types: +# - conv2d +# - depthwise_conv2d -TrainConfig: - train_iter: 5000 - eval_iter: 1000 - learning_rate: - type: CosineAnnealingDecay - learning_rate: 0.00003 - T_max: 6000 - optimizer_builder: - optimizer: - type: SGD - weight_decay: 4.0e-05 +# TrainConfig: +# train_iter: 5000 +# eval_iter: 1000 +# learning_rate: +# type: CosineAnnealingDecay +# learning_rate: 0.00003 +# T_max: 6000 +# optimizer_builder: +# optimizer: +# type: SGD +# weight_decay: 4.0e-05 +QuantPost: + batch_size: 32 + batch_nums: None + algo: 'hist' + hist_percent: 0.999 + bias_correct: False + recon_level: None + regions: None + epochs: 20 + lr: 0.1 + simulate_activation_quant: False + skip_tensor_list: None diff --git a/example/auto_compression/detection/configs/yolo_reader.yml b/example/auto_compression/detection/configs/yolo_reader.yml index d10614530..6e013c1b9 100644 --- a/example/auto_compression/detection/configs/yolo_reader.yml +++ b/example/auto_compression/detection/configs/yolo_reader.yml @@ 
-6,13 +6,13 @@ TrainDataset: !COCODataSet image_dir: train2017 anno_path: annotations/instances_train2017.json - dataset_dir: dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ EvalDataset: !COCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json - dataset_dir: dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ worker_num: 0 diff --git a/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml b/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml index 9c9f58826..8244c90c3 100644 --- a/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml +++ b/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml @@ -6,11 +6,20 @@ Global: dataset: clue batch_size: 16 max_seq_length: 128 -TransformerPrune: - pruned_ratio: 0.25 -HyperParameterOptimization: + + +# 蒸馏 Distillation: -QuantPost: + teacher_model_dir: ./afqmc + teacher_model_filename: inference.pdmodel + teacher_params_filename: inference.pdiparams + +# 剪枝参数 +# 剪枝参数包括剪枝算法和裁剪度 +Prune: + prune_algo: transformer_pruner + pruned_ratio: 0.25 + TrainConfig: epochs: 6 eval_iter: 1070 @@ -20,3 +29,12 @@ TrainConfig: type: AdamW weight_decay: 0.01 origin_metric: 0.7403 + + +# 离线量化 +QuantPost: + activation_bits: 8 + quantize_op_types: + - conv2d + - depthwise_conv2d + weight_bits: 8 diff --git a/example/auto_compression/nlp/configs/uie/uie_base.yaml b/example/auto_compression/nlp/configs/uie/uie_base.yaml index 484f62899..36873084f 100644 --- a/example/auto_compression/nlp/configs/uie/uie_base.yaml +++ b/example/auto_compression/nlp/configs/uie/uie_base.yaml @@ -2,21 +2,24 @@ Global: model_dir: ./UIE model_filename: inference.pdmodel params_filename: inference.pdiparams - batch_size: 1 - max_seq_length: 512 - train_data: ./data/train.txt - dev_data: ./data/dev.txt -TrainConfig: - epochs: 200 - eval_iter: 100 - learning_rate: 1.0e-5 - optimizer_builder: - optimizer: - type: AdamW - weight_decay: 0.01 + task_name: afqmc + dataset: clue + batch_size: 16 + max_seq_length: 128 -QuantAware: - onnx_format: True -Distillation: - alpha: 1.0 - loss: l2 + +HyperParameterOptimization: + batch_num: + - 4 + - 16 + bias_correct: + - true + hist_percent: + - 0.999 + - 0.99999 + max_quant_count: 20 + ptq_algo: + - KL + - hist + weight_quantize_type: + - channel_wise_abs_max \ No newline at end of file diff --git a/example/auto_compression/nlp/run.py b/example/auto_compression/nlp/run.py index 1f6fa5403..5bfac56db 100644 --- a/example/auto_compression/nlp/run.py +++ b/example/auto_compression/nlp/run.py @@ -17,6 +17,8 @@ from paddlenlp.metrics import Mcc, PearsonAndSpearman from paddleslim.common import load_config from paddleslim.auto_compression.compressor import AutoCompression +import sys +sys.setrecursionlimit(1500) # 设置一个更高的限制,例如 1500 def argsparser(): diff --git a/example/post_training_quantization/detection/configs/picodet_s_analysis.yaml b/example/post_training_quantization/detection/configs/picodet_s_analysis.yaml index d3d6944c2..16a134c87 100644 --- a/example/post_training_quantization/detection/configs/picodet_s_analysis.yaml +++ b/example/post_training_quantization/detection/configs/picodet_s_analysis.yaml @@ -1,12 +1,12 @@ input_list: ['image', 'scale_factor'] -model_dir: ./picodet_s_416_coco_lcnet/ +model_dir: ./picodet_s_416_coco_lcnet model_filename: model.pdmodel params_filename: model.pdiparams save_dir: ./analysis_results metric: COCO num_classes: 80 plot_hist: True -get_target_quant_model: False +get_target_quant_model: None 
target_metric: None PTQ: @@ -22,15 +22,15 @@ EvalDataset: !COCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json - dataset_dir: /dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ # Small Dataset to accelerate analysis # If not exist, delete the dict of FastEvalDataset -FastEvalDataset: - !COCODataSet - image_dir: val2017 - anno_path: annotations/small_instances_val2017.json - dataset_dir: /dataset/coco/ +# FastEvalDataset: +# !COCODataSet +# image_dir: val2017 +# anno_path: annotations/small_instances_val2017.json +# dataset_dir: /dataset/coco/ eval_height: &eval_height 416 diff --git a/example/post_training_quantization/detection/configs/picodet_s_analyzed_ptq.yaml b/example/post_training_quantization/detection/configs/picodet_s_analyzed_ptq.yaml index 54aa3cb9c..6c3ea4721 100644 --- a/example/post_training_quantization/detection/configs/picodet_s_analyzed_ptq.yaml +++ b/example/post_training_quantization/detection/configs/picodet_s_analyzed_ptq.yaml @@ -12,13 +12,13 @@ TrainDataset: !COCODataSet image_dir: train2017 anno_path: annotations/instances_train2017.json - dataset_dir: /paddle/dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ EvalDataset: !COCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json - dataset_dir: /paddle/dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ eval_height: &eval_height 416 eval_width: &eval_width 416 diff --git a/example/post_training_quantization/detection/configs/picodet_s_ptq.yaml b/example/post_training_quantization/detection/configs/picodet_s_ptq.yaml index 005c0d46c..a1c5cb70a 100644 --- a/example/post_training_quantization/detection/configs/picodet_s_ptq.yaml +++ b/example/post_training_quantization/detection/configs/picodet_s_ptq.yaml @@ -1,5 +1,5 @@ input_list: ['image', 'scale_factor'] -model_dir: ./picodet_s_416_coco_lcnet/ +model_dir: ./picodet_s_analyzed_ptq_out model_filename: model.pdmodel params_filename: model.pdiparams skip_tensor_list: None @@ -12,13 +12,13 @@ TrainDataset: !COCODataSet image_dir: train2017 anno_path: annotations/instances_train2017.json - dataset_dir: /dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ EvalDataset: !COCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json - dataset_dir: /dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ eval_height: &eval_height 416 eval_width: &eval_width 416 @@ -34,5 +34,5 @@ EvalReader: - Resize: {interp: 2, target_size: *eval_size, keep_ratio: False} - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} - Permute: {} - batch_size: 32 + batch_size: 16 diff --git a/example/post_training_quantization/detection/configs/ppyoloe_s_ptq.yaml b/example/post_training_quantization/detection/configs/ppyoloe_s_ptq.yaml index 3c8752652..5fcf7212d 100644 --- a/example/post_training_quantization/detection/configs/ppyoloe_s_ptq.yaml +++ b/example/post_training_quantization/detection/configs/ppyoloe_s_ptq.yaml @@ -1,4 +1,4 @@ -input_list: ['image'] +input_list: ['image', 'scale_factor'] arch: PPYOLOE # When export exclude_nms=True, need set arch: PPYOLOE model_dir: ./ppyoloe_crn_s_300e_coco model_filename: model.pdmodel @@ -12,13 +12,13 @@ TrainDataset: !COCODataSet image_dir: train2017 anno_path: annotations/instances_train2017.json - dataset_dir: /dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ EvalDataset: 
!COCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json - dataset_dir: /dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ worker_num: 0 @@ -29,4 +29,4 @@ EvalReader: - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2} - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} - Permute: {} - batch_size: 32 \ No newline at end of file + batch_size: 16 \ No newline at end of file diff --git a/example/post_training_quantization/detection/eval.py b/example/post_training_quantization/detection/eval.py index f8e1342d5..47fe16225 100644 --- a/example/post_training_quantization/detection/eval.py +++ b/example/post_training_quantization/detection/eval.py @@ -97,10 +97,11 @@ def eval(): if k in config['input_list'].keys(): data_input[config['input_list'][k]] = np.array(v) - outs = exe.run(val_program, - feed=data_input, - fetch_list=fetch_targets, - return_numpy=False) + outs = exe.run( + val_program, + feed=data_input, + fetch_list=fetch_targets, + return_numpy=False) res = {} if 'arch' in config and config['arch'] == 'keypoint': res = keypoint_post_process(data, data_input, exe, val_program, @@ -112,6 +113,7 @@ def eval(): else: for out in outs: v = np.array(out) + # print("v",v) if len(v.shape) > 1: res['bbox'] = v else: @@ -130,9 +132,8 @@ def main(): dataset = config['EvalDataset'] global val_loader - val_loader = create('EvalReader')(config['EvalDataset'], - config['worker_num'], - return_list=True) + val_loader = create('EvalReader')( + config['EvalDataset'], config['worker_num'], return_list=True) metric = None if config['metric'] == 'COCO': clsid2catid = {v: k for k, v in dataset.catid2clsid.items()} diff --git a/example/post_training_quantization/pytorch_yolo_series/README.md b/example/post_training_quantization/pytorch_yolo_series/README.md index 4bb4d304f..63a7d96c1 100755 --- a/example/post_training_quantization/pytorch_yolo_series/README.md +++ b/example/post_training_quantization/pytorch_yolo_series/README.md @@ -122,7 +122,7 @@ python eval.py --config_path=./configs/yolov5s_ptq.yaml #### 3.6 提高离线量化精度 ###### 3.6.1 量化分析工具 -本节介绍如何使用量化分析工具提升离线量化精度。离线量化功能仅需使用少量数据,且使用简单、能快速得到量化模型,但往往会造成较大的精度损失。PaddleSlim提供量化分析工具,会使用接口```paddleslim.quant.AnalysisPTQ```,可视化展示出不适合量化的层,通过跳过这些层,提高离线量化模型精度。```paddleslim.quant.AnalysisPTQ```详解见[AnalysisPTQ.md](../../../docs/zh_cn/tutorials/quant/AnalysisPTQ.md)。 +本节介绍如何使用量化分析工具提升离线量化精度。离线量化功能仅需使用少量数据,且使用简单、能快速得到量化模型,但往往会造成较大的精度损失。PaddleSlim提供量化分析工具,会使用接口```paddleslim.quant.AnalysisPTQ```,可视化展示出不适合量化的层,通过跳过这些层,提高离线量化模型精度。```paddleslim.quant.AnalysisPTQ```详解见[AnalysisPTQ.md](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/zh_cn/tutorials/quant/post_training_quantization.md)。 由于YOLOv6离线量化效果较差,以YOLOv6为例,量化分析工具具体使用方法如下: @@ -207,7 +207,70 @@ python fine_tune.py --config_path=./configs/yolov6s_fine_tune.yaml --simulate_ac ## 4.预测部署 预测部署可参考[YOLO系列模型自动压缩示例](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/example/auto_compression/pytorch_yolo_series) - - +量化模型在GPU上可以使用TensorRT进行加速,在CPU上可以使用MKLDNN进行加速。 +| 参数名 | 含义 | +| model_path | inference模型文件所在路径,该目录下需要有文件model.pdmodel和params.pdiparams两个文件 | +| dataset_dir | 指定COCO数据集的目录,这是存储数据集的根目录 | +| image_file | 如果只测试单张图片效果,直接根据image_file指定图片路径 | +| val_image_dir | COCO数据集中验证图像的目录名,默认为val2017 | +| val_anno_path | 指定COCO数据集的注释(annotation)文件路径,这是包含验证集标注信息的JSON文件,默认为annotations/instances_val2017.json | +| benchmark | 指定是否运行性能基准测试。如果设置为True,程序将会进行性能测试 | +| device | 使用GPU或者CPU预测,可选CPU/GPU/XPU,默认设置为GPU | +| use_trt | 
是否使用TensorRT进行预测| +| use_mkldnn | 是否使用MKL-DNN加速库,注意use_mkldnn与use_gpu同时为True时,将忽略enable_mkldnn,而使用GPU预测| +| use_dynamic_shape | 是否使用动态形状(dynamic_shape)功能 | +| precision | fp32/fp16/int8| +| arch | 指定所使用的模型架构的名称,例如YOLOv5 | +| img_shape | 指定模型输入的图像尺寸 | +| batch_size | 指定模型输入的批处理大小 | +| use_mkldnn | 指定是否使用MKLDNN加速(主要针对CPU)| +| cpu_threads | 指定在CPU上使用的线程数 | + +首先,我们拥有的yolov6.onnx,我们需要把ONNX模型转成paddle模型,具体参考使用[X2Paddle迁移推理模型](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/model_convert/convert_with_x2paddle_cn.html#x2paddle) +- 安装X2Paddle +方式一:pip 安装 +```shell +pip install X2Paddle==1.3.9 +``` +方式二:源码安装 +```shell +git clone https://github.com/PaddlePaddle/X2Paddle.git +cd X2Paddle +python setup.py install +``` +使用命令将YOLOv6.onnx模型转换成paddle模型 +```shell +x2paddle --framework=onnx --model=yolov6s.onnx --save_dir=yolov6_model +``` +- TensorRT Python部署 +使用[paddle_inference_eval.py](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/example/auto_compression/pytorch_yolo_series/paddle_inference_eval.py)部署 +```shell +python paddle_inference_eval.py --model_path=yolov6_model/inference_model --dataset_dir=datasets/coco --use_trt=True --precision=fp32 --arch=YOLOv6 +``` +执行int8量化 +```shell +python paddle_inference_eval.py --model_path=yolov6s_ptq_out --dataset_dir==datasets/coco --use_trt=True --precision=int8 --arch=YOLOv6 +``` +- C++部署 +具体可参考[运行PP-YOLOE-l目标检测模型样例](https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/c%2B%2B/gpu/ppyoloe_crn_l) +将compile.sh中DEMO_NAME修改为yolov6_test,并且将ppyoloe_crn_l.cc修改为yolov6_test.cc,根据环境修改相关配置库 +运行bash compile.sh编译样例。 +- 运行样例 +-使用原生GPU运行样例(将ONNX模型转成的paddle模型复制到Paddle-Inference-demo/c++/gpu/ppyoloe_crn_l/目录下) +```shell +./build/yolov6_test --model_file yolov6s_infer/model.pdmodel --params_file yolov6s_infer/model.pdiparams +``` +- 使用TensorRT FP32运行样例 +```shell +./build/yolov6_test --model_file yolov6s_infer/model.pdmodel --params_file yolov6s_infer/model.pdiparams --run_mode=trt_fp32 +``` +- 使用TensorRT FP16运行样例 +```shell +./build/yolov6_test --model_file yolov6s_infer/model.pdmodel --params_file yolov6s_infer/model.pdiparams --run_mode=trt_fp16 +``` +- 使用TensorRT INT8运行样例 +```shell +./build/yolov6_test --model_file yolov6s_infer/model.pdmodel --params_file yolov6s_infer/model.pdiparams --run_mode=trt_int8 +``` ## 5.FAQ - 如果想对模型进行自动压缩,可进入[YOLO系列模型自动压缩示例](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/example/auto_compression/pytorch_yolo_series)中进行实验。 diff --git a/setup.py b/setup.py index bc2842802..a2f84f961 100644 --- a/setup.py +++ b/setup.py @@ -22,17 +22,18 @@ from setuptools import find_packages from setuptools import setup -if 'develop' in subprocess.getoutput('git branch'): - slim_version = '0.0.0_dev' -else: - tag_list = subprocess.getoutput('git tag').split('\n') - if 'rc' in tag_list[-1]: - if tag_list[-1].split('rc')[0] in tag_list[-2]: - slim_version = tag_list[-2] - else: - slim_version = tag_list[-1] - else: - slim_version = tag_list[-1] +# if 'develop' in subprocess.getoutput('git branch'): +# slim_version = '0.0.0_dev' +# else: +# tag_list = subprocess.getoutput('git tag').split('\n') +# if 'rc' in tag_list[-1]: +# if tag_list[-1].split('rc')[0] in tag_list[-2]: +# slim_version = tag_list[-2] +# else: +# slim_version = tag_list[-1] +# else: +# slim_version = tag_list[-1] +slim_version = '2.6.0' with open('./requirements.txt') as f: setup_requires = f.read().splitlines()
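For reference, a minimal NumPy sketch of the channel-wise 4-bit layout introduced in PATCH 1 (blocksize = -1): the per-channel scale is the column-wise max of |A|, rows 2k and 2k+1 of each column share one packed byte (high/low nibble), and the packed output has shape [rows/2, cols]. The 4-bit codec below is a hypothetical uniform stand-in (the real NF4/FP4 codebooks dQuantizeNF4 / dDequantizeNF4 are not reproduced), so only the packing, indexing, and scaling mirror the kernels.

```python
import numpy as np

def _encode4(x):
    # Hypothetical stand-in for dQuantizeNF4: map [-1, 1] onto a 4-bit code.
    return np.clip(np.round((x + 1.0) * 7.5), 0, 15).astype(np.uint8)

def _decode4(c):
    # Inverse of the stand-in codec above.
    return c.astype(np.float32) / 7.5 - 1.0

def quantize_channelwise(A):
    # A: [rows, cols], rows even; one scale per column, as in A.abs().max({0}).
    absmax = np.abs(A).max(axis=0)                  # shape [cols]
    scaled = A / absmax                             # per-column normalization
    hi = _encode4(scaled[0::2, :])                  # rows 2k   -> high nibble
    lo = _encode4(scaled[1::2, :])                  # rows 2k+1 -> low nibble
    return ((hi << 4) | lo).astype(np.uint8), absmax  # packed: [rows//2, cols]

def dequantize_channelwise(packed, absmax):
    out = np.empty((packed.shape[0] * 2, packed.shape[1]), dtype=np.float32)
    out[0::2, :] = _decode4(packed >> 4) * absmax    # row 2k   from high nibble
    out[1::2, :] = _decode4(packed & 0x0F) * absmax  # row 2k+1 from low nibble
    return out

A = np.random.randn(8, 4).astype(np.float32)
packed, absmax = quantize_channelwise(A)
recon = dequantize_channelwise(packed, absmax)
assert packed.shape == (4, 4) and recon.shape == A.shape
```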