From 603e0b8a6835aa22d9e92990c38307dc3c2f524d Mon Sep 17 00:00:00 2001 From: ceci3 Date: Tue, 19 Dec 2023 18:30:32 +0800 Subject: [PATCH 1/3] support nf4 channel wise quant & fix bug when blocksize>512 (#1817) --- csrc/lc/dequantize_blockwise.cu | 84 ++++++++++++++++++++--- csrc/lc/quantize_blockwise.cu | 115 ++++++++++++++++++++++++-------- 2 files changed, 162 insertions(+), 37 deletions(-) diff --git a/csrc/lc/dequantize_blockwise.cu b/csrc/lc/dequantize_blockwise.cu index 8046c34ac..0bf76a163 100644 --- a/csrc/lc/dequantize_blockwise.cu +++ b/csrc/lc/dequantize_blockwise.cu @@ -201,7 +201,6 @@ template __global__ void kDequantizeBlockwise(const floa //template __global__ void kDequantizeBlockwise<__nv_bfloat16, 512, 64, 8, NF4>(const float *code, const unsigned char * A, const float * absmax, __nv_bfloat16 *out, int blocksize, int n); - template void dequantize_blockwise(const float *code, const unsigned char *A, const float *absmax, T *out, int blocksize, int n) { int num_blocks = n/blocksize; @@ -226,6 +225,50 @@ template void dequantize_blockwise(const float *code, const unsigned //template void dequantize_blockwise<__nv_bfloat16, FP4>(const float *code, const unsigned char *A, const float *absmax, __nv_bfloat16 *out, int blocksize, int n); //template void dequantize_blockwise<__nv_bfloat16, NF4>(const float *code, const unsigned char *A, const float *absmax, __nv_bfloat16 *out, int blocksize, int n); +template +__global__ void kDequantizeChannelwise(const unsigned char* A, + const float *absmax, + float *out, + int n, + int cout) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + + int num = n / 2; + //int part_n = num / cout; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + float local_absmax = absmax[i%cout]; + int idx = 2*(i/cout)* cout + i%cout; + switch(DATA_TYPE) + { + case FP4: + out[i*2 + i%cout] = dDequantizeFP4Tree(A[i] >> 4, local_absmax); + out[i*2 + cout + i%cout] = dDequantizeFP4Tree(A[i] & 0x0F, local_absmax); + break; + case NF4: + out[idx] = dDequantizeNF4(A[i] >> 4)* local_absmax; + out[idx + cout] = dDequantizeNF4(A[i] & 0x0F)* local_absmax; + break; + } + __syncthreads(); + } +} + +template void dequantize_channelwise(const unsigned char *A, const float *absmax, T *out, int n, int cout) +{ + int max_threads = 1024; + int64_t block_size = + std::min(static_cast(n), + static_cast(max_threads/ 4)); + + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (n + block_size - 1) / block_size); + + kDequantizeChannelwise<<>>(A, absmax, out, n, cout); + CUDA_CHECK_RETURN(cudaPeekAtLastError()); +} + std::vector DequantizeBlockwise(const paddle::Tensor& input, const paddle::Tensor& code, const paddle::Tensor& absmax, int blocksize, std::string quant_type) { int64_t input_numel = input.numel(); int n = input_numel; @@ -234,23 +277,44 @@ std::vector DequantizeBlockwise(const paddle::Tensor& input, con out_shape = {input_numel * 2, 1}; n = n * 2; } + if (blocksize == -1) { + out_shape = {input.shape()[0] * 2, input.shape()[1]}; + } auto out = paddle::empty(out_shape, paddle::DataType::FLOAT32, input.place()); - if (quant_type == "8bit") - dequantize_blockwise(code.data(), input.data(), absmax.data(), out.data(), blocksize, n); - else if (quant_type == "nf4") - dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); - else if (quant_type == "fp4") - dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); - else - 
PD_THROW("NOT supported quant type. Only 8bit, nf4, fp4 are supported. "); + if (blocksize == -1) { + if (quant_type == "8bit") + PD_THROW("blocksize is -1 only support NF4 and FP4."); + else + blocksize = n / absmax.numel() * 2; + + int cout = input.shape()[1]; + if (quant_type == "nf4") + dequantize_channelwise(input.data(), absmax.data(), out.data(), n, cout); + else if (quant_type == "fp4") + dequantize_channelwise(input.data(), absmax.data(), out.data(), n, cout); + else + PD_THROW("NOT supported quant type. Only 8bit, nf4, fp4 are supported. "); + } else { + if (quant_type == "8bit") + dequantize_blockwise(code.data(), input.data(), absmax.data(), out.data(), blocksize, n); + else if (quant_type == "nf4") + dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); + else if (quant_type == "fp4") + dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); + else + PD_THROW("NOT supported quant type. Only 8bit, nf4, fp4 are supported. "); + } return {out}; }; std::vector> GetDequantizeBlockwiseInferShape(const std::vector& input_shape, const std::vector& code_shape, const std::vector& abs_max_shape, int blocksize, std::string quant_type){ int64_t first_shape = input_shape[0] * input_shape[1] * 2; if (quant_type != "8bit") - return {{first_shape, 1}}; + if (blocksize != -1) + return {{first_shape, 1}}; + else + return {{input_shape[0] * 2, input_shape[1]}}; else return {input_shape}; } diff --git a/csrc/lc/quantize_blockwise.cu b/csrc/lc/quantize_blockwise.cu index d4f6ff2ca..e8e55b9d8 100644 --- a/csrc/lc/quantize_blockwise.cu +++ b/csrc/lc/quantize_blockwise.cu @@ -279,6 +279,7 @@ __global__ void kQuantizeBlockwise(const float * code, const T * __restrict__ A, #pragma unroll NUM_PER_TH for(int j = 0; j < NUM_PER_TH/2; j++) { + packed_4bit = 0; packed_4bit |= dQuantizeNF4(((float)vals[2*j])*local_abs_max) << 4; packed_4bit |= dQuantizeNF4(((float)vals[2*j+1])*local_abs_max); qvals[j] = packed_4bit; @@ -360,9 +361,39 @@ MAKE_kQuantizeBlockwise(__nv_bfloat16, 256, 2, NF4) MAKE_kQuantizeBlockwise(__nv_bfloat16, 128, 2, NF4) MAKE_kQuantizeBlockwise(__nv_bfloat16, 64, 2, NF4) +template +__global__ void kQuantizeChannelwise(const float *code, + const T* A, + unsigned char* out, + float *absmax, + int n, + int cout) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + + int num = n / 2; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + int idx = 2*(i/cout)* cout + i%cout; + float local_absmax = absmax[i %cout]; + float inv_local_absmax = 1.0f/local_absmax; + unsigned char packed_4bit = 0; + switch(DATA_TYPE) + { + case FP4: + packed_4bit |= dQuantizeFP4(((float)A[idx])*inv_local_absmax) << 4; + packed_4bit |= dQuantizeFP4(((float)A[idx+cout])*inv_local_absmax); + out[i] = packed_4bit; + break; + case NF4: + packed_4bit |= dQuantizeNF4(((float)A[idx])*inv_local_absmax) << 4; + packed_4bit |= dQuantizeNF4(((float)A[idx+cout])*inv_local_absmax); + out[i] = packed_4bit; + break; + } + } +} -template void quantize_blockwise(const float *code, const paddle::Tensor& A, float *absmax, unsigned char *out, int blocksize, int n) +template void quantize_blockwise(const float *code, const paddle::Tensor& A, paddle::Tensor& absmax, unsigned char *out, int blocksize, int n, int channelwise) { typedef PDTraits traits_; typedef typename traits_::DataType DataType_; @@ -372,22 +403,43 @@ template void quantize_blockwise(const float num_blocks = n % blocksize == 0 ? 
num_blocks : num_blocks + 1; const DataType_* A_data = reinterpret_cast(A.data()); - if(blocksize == 4096) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 2048) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 1024) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 512) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 256) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 128) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 64) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else - PD_THROW("only support blocksize is [64, 128, 256, 512, 1024, 2048, 4096]."); + if (channelwise == 0) { + if(blocksize == 4096) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 2048) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 1024) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 512) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 256) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 128) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 64) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + } + else { + if (DATA_TYPE == General8bit) + PD_THROW("blocksize is -1 only support NF4 and FP4."); + + int cout = A.shape()[1]; + int max_threads = 1024; + + absmax = A.abs().max({0}); + + int64_t block_size = + std::min(static_cast(n), + static_cast(max_threads/ 4)); + + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (n + block_size - 1) / block_size); + + kQuantizeChannelwise<<>>( + code, A_data, out, absmax.data(), n, cout); + } CUDA_CHECK_RETURN(cudaPeekAtLastError()); @@ -395,38 +447,44 @@ template void quantize_blockwise(const float std::vector QuantizeBlockwise(const paddle::Tensor& input, const paddle::Tensor& code, int blocksize, std::string quant_type) { int n = input.numel(); + int channelwise = 0; std::vector out_shape = input.shape(); if (quant_type != "8bit") { // 4bit out_shape = {(n + 1) / 2, 1}; } + if (blocksize == -1){ + blocksize = input.shape()[0]; + out_shape = {input.shape()[0]/2, input.shape()[1]}; + channelwise = 1; + } auto out = paddle::empty(out_shape, paddle::DataType::UINT8, input.place()); int64_t absmax_shape = n / blocksize; auto absmax = paddle::empty({absmax_shape}, paddle::DataType::FLOAT32, input.place()); switch(input.type()) { case paddle::DataType::FLOAT32: if (quant_type == "8bit") - quantize_blockwise(code.data(), input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(code.data(), input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "nf4") { - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); } else if (quant_type == "fp4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); return {out, absmax}; case paddle::DataType::FLOAT16: if (quant_type == "8bit") - quantize_blockwise(code.data(), input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(code.data(), input, absmax, out.data(), blocksize, n, 
channelwise); else if (quant_type == "nf4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "fp4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); return {out, absmax}; case paddle::DataType::BFLOAT16: if (quant_type == "8bit") - quantize_blockwise(code.data(), input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(code.data(), input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "nf4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "fp4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); return {out, absmax}; default: @@ -440,7 +498,10 @@ std::vector QuantizeBlockwise(const paddle::Tensor& input, const std::vector> GetQuantizeBlockwiseInferShape(const std::vector& input_shape, const std::vector& code_shape, int blocksize, std::string quant_type){ int64_t first_shape = (input_shape[0] * input_shape[1] + 1) / 2; if (quant_type != "8bit") - return {{first_shape, 1}}; + if (blocksize != -1) + return {{first_shape, 1}}; + else + return {{input_shape[0]/2, input_shape[1]}}; else return {input_shape}; } From 40d48cc92aed5ed65493e52586db35b259d4b112 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Fri, 29 Dec 2023 07:36:58 +0000 Subject: [PATCH 2/3] =?UTF-8?q?=E8=B7=91=E9=80=9A+=E6=94=AF=E6=8C=81?= =?UTF-8?q?=E6=B5=8B=E9=80=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../detection/paddle_inference_eval.py | 94 +++++++++++++++++-- 1 file changed, 84 insertions(+), 10 deletions(-) diff --git a/example/auto_compression/detection/paddle_inference_eval.py b/example/auto_compression/detection/paddle_inference_eval.py index d2e12afd1..c66ce7113 100644 --- a/example/auto_compression/detection/paddle_inference_eval.py +++ b/example/auto_compression/detection/paddle_inference_eval.py @@ -82,9 +82,15 @@ def argsparser(): parser.add_argument("--img_shape", type=int, default=640, help="input_size") parser.add_argument( '--include_nms', - type=bool, - default=True, + type=str, + default='True', help="Whether include nms or not.") + # 是否用来测速 + parser.add_argument( + '--speed', + type=str, + default='True', + help="if speed is True, it will print the inference time.") return parser @@ -238,9 +244,11 @@ def load_predictor( config = Config( os.path.join(model_dir, "model.pdmodel"), os.path.join(model_dir, "model.pdiparams")) + + config.enable_memory_optim() if device == "GPU": # initial GPU memory(M), device ID - config.enable_use_gpu(200, 0) + config.enable_use_gpu(1000, 0) # optimize graph and fuse op config.switch_ir_optim(True) else: @@ -260,7 +268,7 @@ def load_predictor( } if precision in precision_map.keys() and use_trt: config.enable_tensorrt_engine( - workspace_size=(1 << 25) * batch_size, + workspace_size=(1 << 30) * batch_size, max_batch_size=batch_size, min_subgraph_size=min_subgraph_size, precision_mode=precision_map[precision], @@ -297,6 +305,7 @@ def predict_image(predictor, img, scale_factor = image_preprocess(image_file, image_shape) inputs = {} inputs["image"] = img + if FLAGS.include_nms: inputs['scale_factor'] 
= scale_factor
     input_names = predictor.get_input_names()
@@ -354,6 +363,9 @@ def eval(predictor, val_loader, metric, rerun_flag=False):
     input_names = predictor.get_input_names()
     output_names = predictor.get_output_names()
     boxes_tensor = predictor.get_output_handle(output_names[0])
+    print("output_names:", output_names)
+    print("Number of outputs:", len(output_names))
+    print("FLAGS.include_nms:", FLAGS.include_nms)
     if FLAGS.include_nms:
         boxes_num = predictor.get_output_handle(output_names[1])
     for batch_id, data in enumerate(val_loader):
@@ -374,27 +386,79 @@ def eval(predictor, val_loader, metric, rerun_flag=False):
             time_min = min(time_min, timed)
             time_max = max(time_max, timed)
             predict_time += timed
-        if not FLAGS.include_nms:
+        # print("FLAGS.include_nms:", FLAGS.include_nms)
+        # print("FLAGS.speed:", FLAGS.speed)
+        # If include_nms is False and FLAGS.speed is True, run PPYOLOEPostProcess
+        if not FLAGS.include_nms and FLAGS.speed:
+            # print("took the PPYOLOEPostProcess branch")
             postprocess = PPYOLOEPostProcess(
                 score_threshold=0.3, nms_threshold=0.6)
             res = postprocess(np_boxes, data_all['scale_factor'])
-        else:
+        # If include_nms is False and FLAGS.speed is False, skip this batch
+        elif not FLAGS.include_nms and not FLAGS.speed:
+            continue
+        # If include_nms is True, use the model outputs directly
+        elif FLAGS.include_nms:
+            # print("returned the raw outputs directly")
             res = {'bbox': np_boxes, 'bbox_num': np_boxes_num}
         metric.update(data_all, res)
         if batch_id % 100 == 0:
             print("Eval iter:", batch_id)
             sys.stdout.flush()
     metric.accumulate()
-    metric.log()
+    if not FLAGS.speed:
+        metric.log()
     map_res = metric.get_results()
     metric.reset()
     time_avg = predict_time / sample_nums
     print("[Benchmark]Inference time(ms): min={}, max={}, avg={}".format(
         round(time_min * 1000, 2),
         round(time_max * 1000, 1), round(time_avg * 1000, 1)))
-    print("[Benchmark] COCO mAP: {}".format(map_res["bbox"][0]))
+    if not FLAGS.speed:
+        print("[Benchmark] COCO mAP: {}".format(map_res["bbox"][0]))
     sys.stdout.flush()
 
+def inference_time(predictor, val_loader, metric, rerun_flag=False):
+    cpu_mems, gpu_mems = 0, 0
+    predict_time = 0.0
+    time_min = float("inf")
+    time_max = float("-inf")
+    sample_nums = len(val_loader)
+    input_names = predictor.get_input_names()
+    output_names = predictor.get_output_names()
+    boxes_tensor = predictor.get_output_handle(output_names[0])
+    print("output_names:", output_names)
+    print("Number of outputs:", len(output_names))
+    print("FLAGS.include_nms:", FLAGS.include_nms)
+    if FLAGS.include_nms:
+        boxes_num = predictor.get_output_handle(output_names[1])
+
+    for batch_id, data in enumerate(val_loader):
+        data_all = {k: np.array(v) for k, v in data.items()}
+        for i, _ in enumerate(input_names):
+            input_tensor = predictor.get_input_handle(input_names[i])
+            input_tensor.copy_from_cpu(data_all[input_names[i]])
+        paddle.device.cuda.synchronize()
+        start_time = time.time()
+        predictor.run()
+        # np_boxes = boxes_tensor.copy_to_cpu()
+        if FLAGS.include_nms:
+            np_boxes_num = boxes_num.copy_to_cpu()
+        if rerun_flag:
+            return
+        end_time = time.time()
+        timed = end_time - start_time
+        time_min = min(time_min, timed)
+        time_max = max(time_max, timed)
+        predict_time += timed
+        # print("FLAGS.include_nms:", FLAGS.include_nms)
+        # print("FLAGS.speed:", FLAGS.speed)
+        # If include_nms is False and FLAGS.speed is True, run PPYOLOEPostProcess
+    time_avg = predict_time / sample_nums
+    print("[Benchmark]Inference time(ms): min={}, max={}, avg={}".format(
+        round(time_min * 1000, 2),
+        round(time_max * 1000, 1), round(time_avg * 1000, 1)))
+    sys.stdout.flush()
 
 def main():
     """
@@ -421,7 +485,7 @@ def main():
repeats=repeats) else: reader_cfg = load_config(FLAGS.reader_config) - + dataset = reader_cfg["EvalDataset"] global val_loader val_loader = create("EvalReader")( @@ -432,7 +496,10 @@ def main(): anno_file = dataset.get_anno() metric = COCOMetric( anno_file=anno_file, clsid2catid=clsid2catid, IouType="bbox") - eval(predictor, val_loader, metric, rerun_flag=rerun_flag) + if not FLAGS.speed: + eval(predictor, val_loader, metric, rerun_flag=rerun_flag) + else: + inference_time(predictor, val_loader, metric, rerun_flag=rerun_flag) if rerun_flag: print( @@ -444,6 +511,13 @@ def main(): paddle.enable_static() parser = argsparser() FLAGS = parser.parse_args() + if FLAGS.include_nms=='True': + FLAGS.include_nms = True + else: + FLAGS.include_nms = False + + print('**************main****************') + print(FLAGS) # DataLoader need run on cpu paddle.set_device("cpu") From 26a6d919b79cf114f9b3486aa13f5a78d7258977 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Wed, 3 Jan 2024 09:07:34 +0000 Subject: [PATCH 3/3] =?UTF-8?q?=E6=9C=80=E7=BB=88=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../detection/configs/ppyoloe_l_qat_dis.yaml | 2 +- .../detection/configs/yolo_reader.yml | 4 +- .../detection/paddle_inference_eval.py | 94 ++++--------------- .../detection/post_process.py | 2 +- 4 files changed, 21 insertions(+), 81 deletions(-) diff --git a/example/auto_compression/detection/configs/ppyoloe_l_qat_dis.yaml b/example/auto_compression/detection/configs/ppyoloe_l_qat_dis.yaml index 7102142ed..017e88b44 100644 --- a/example/auto_compression/detection/configs/ppyoloe_l_qat_dis.yaml +++ b/example/auto_compression/detection/configs/ppyoloe_l_qat_dis.yaml @@ -2,7 +2,7 @@ Global: reader_config: configs/yolo_reader.yml arch: PPYOLOE - include_nms: True + include_nms: False Evaluation: True model_dir: ./ppyoloe_crn_l_300e_coco model_filename: model.pdmodel diff --git a/example/auto_compression/detection/configs/yolo_reader.yml b/example/auto_compression/detection/configs/yolo_reader.yml index d10614530..6e013c1b9 100644 --- a/example/auto_compression/detection/configs/yolo_reader.yml +++ b/example/auto_compression/detection/configs/yolo_reader.yml @@ -6,13 +6,13 @@ TrainDataset: !COCODataSet image_dir: train2017 anno_path: annotations/instances_train2017.json - dataset_dir: dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ EvalDataset: !COCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json - dataset_dir: dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ worker_num: 0 diff --git a/example/auto_compression/detection/paddle_inference_eval.py b/example/auto_compression/detection/paddle_inference_eval.py index c66ce7113..3702bd49a 100644 --- a/example/auto_compression/detection/paddle_inference_eval.py +++ b/example/auto_compression/detection/paddle_inference_eval.py @@ -18,6 +18,7 @@ import sys import cv2 import numpy as np +from tqdm import tqdm import paddle from paddle.inference import Config @@ -85,12 +86,12 @@ def argsparser(): type=str, default='True', help="Whether include nms or not.") - # 是否用来测速 parser.add_argument( - '--speed', - type=str, - default='True', - help="if speed is True, it will print the inference time.") + "--trt_calib_mode", + type=bool, + default=False, + help="If the model is produced by TRT offline quantitative " + "calibration, trt_calib_mode need to set True.") return parser @@ -214,8 +215,9 @@ def load_predictor( 
use_mkldnn=False, batch_size=1, device="CPU", - min_subgraph_size=3, + min_subgraph_size=4, use_dynamic_shape=False, + trt_calib_mode=False, trt_min_shape=1, trt_max_shape=1280, trt_opt_shape=640, @@ -273,7 +275,7 @@ def load_predictor( min_subgraph_size=min_subgraph_size, precision_mode=precision_map[precision], use_static=True, - use_calib_mode=False, ) + use_calib_mode=False) if use_dynamic_shape: dynamic_shape_file = os.path.join(FLAGS.model_path, @@ -363,12 +365,9 @@ def eval(predictor, val_loader, metric, rerun_flag=False): input_names = predictor.get_input_names() output_names = predictor.get_output_names() boxes_tensor = predictor.get_output_handle(output_names[0]) - print("output_names:", output_names) - print("Number of outputs:", len(output_names)) - print("FLAGS.include_nms:", FLAGS.include_nms) if FLAGS.include_nms: boxes_num = predictor.get_output_handle(output_names[1]) - for batch_id, data in enumerate(val_loader): + for batch_id, data in tqdm(enumerate(val_loader), total=len(val_loader), desc='Evaluating'): data_all = {k: np.array(v) for k, v in data.items()} for i, _ in enumerate(input_names): input_tensor = predictor.get_input_handle(input_names[i]) @@ -386,79 +385,26 @@ def eval(predictor, val_loader, metric, rerun_flag=False): time_min = min(time_min, timed) time_max = max(time_max, timed) predict_time += timed - # print("FLAGS.include_nms:", FLAGS.include_nms) - # print("FLAGS.speed:", FLAGS.speed) - # 如果include_nms为false且flags.speed为True,则走PPYOLOEPostProcess - if not FLAGS.include_nms and FLAGS.speed: - # print("nms为True的时候走了PPYOLOEPostProcess") + if not FLAGS.include_nms: postprocess = PPYOLOEPostProcess( score_threshold=0.3, nms_threshold=0.6) res = postprocess(np_boxes, data_all['scale_factor']) - #如果include_nms为false且flags.speed为False,则跳过 - elif not FLAGS.include_nms and not FLAGS.speed: - continue - #如果include_nms,则直接返回 - elif FLAGS.include_nms: - # print("nms为False的时候直接返回") + else: res = {'bbox': np_boxes, 'bbox_num': np_boxes_num} metric.update(data_all, res) if batch_id % 100 == 0: - print("Eval iter:", batch_id) sys.stdout.flush() metric.accumulate() - if not FLAGS.speed: - metric.log() + metric.log() map_res = metric.get_results() metric.reset() time_avg = predict_time / sample_nums print("[Benchmark]Inference time(ms): min={}, max={}, avg={}".format( round(time_min * 1000, 2), round(time_max * 1000, 1), round(time_avg * 1000, 1))) - if not FLAGS.speed: - print("[Benchmark] COCO mAP: {}".format(map_res["bbox"][0])) + print("[Benchmark] COCO mAP: {}".format(map_res["bbox"][0])) sys.stdout.flush() -def inference_time(predictor, val_loader, metric, rerun_flag=False): - cpu_mems, gpu_mems = 0, 0 - predict_time = 0.0 - time_min = float("inf") - time_max = float("-inf") - sample_nums = len(val_loader) - input_names = predictor.get_input_names() - output_names = predictor.get_output_names() - boxes_tensor = predictor.get_output_handle(output_names[0]) - print("output_names:", output_names) - print("Number of outputs:", len(output_names)) - print("FLAGS.include_nms:", FLAGS.include_nms) - if FLAGS.include_nms: - boxes_num = predictor.get_output_handle(output_names[1]) - - for batch_id, data in enumerate(val_loader): - data_all = {k: np.array(v) for k, v in data.items()} - for i, _ in enumerate(input_names): - input_tensor = predictor.get_input_handle(input_names[i]) - input_tensor.copy_from_cpu(data_all[input_names[i]]) - paddle.device.cuda.synchronize() - start_time = time.time() - predictor.run() - # np_boxes = boxes_tensor.copy_to_cpu() - if FLAGS.include_nms: - 
np_boxes_num = boxes_num.copy_to_cpu() - if rerun_flag: - return - end_time = time.time() - timed = end_time - start_time - time_min = min(time_min, timed) - time_max = max(time_max, timed) - predict_time += timed - # print("FLAGS.include_nms:", FLAGS.include_nms) - # print("FLAGS.speed:", FLAGS.speed) - # 如果include_nms为false且flags.speed为True,则走PPYOLOEPostProcess - time_avg = predict_time / sample_nums - print("[Benchmark]Inference time(ms): min={}, max={}, avg={}".format( - round(time_min * 1000, 2), - round(time_max * 1000, 1), round(time_avg * 1000, 1))) - sys.stdout.flush() def main(): """ @@ -485,7 +431,6 @@ def main(): repeats=repeats) else: reader_cfg = load_config(FLAGS.reader_config) - dataset = reader_cfg["EvalDataset"] global val_loader val_loader = create("EvalReader")( @@ -496,11 +441,9 @@ def main(): anno_file = dataset.get_anno() metric = COCOMetric( anno_file=anno_file, clsid2catid=clsid2catid, IouType="bbox") - if not FLAGS.speed: - eval(predictor, val_loader, metric, rerun_flag=rerun_flag) - else: - inference_time(predictor, val_loader, metric, rerun_flag=rerun_flag) - + + eval(predictor, val_loader, metric, rerun_flag=rerun_flag) + if rerun_flag: print( "***** Collect dynamic shape done, Please rerun the program to get correct results. *****" @@ -516,9 +459,6 @@ def main(): else: FLAGS.include_nms = False - print('**************main****************') - print(FLAGS) - # DataLoader need run on cpu paddle.set_device("cpu") diff --git a/example/auto_compression/detection/post_process.py b/example/auto_compression/detection/post_process.py index eea2f0195..76fc5ac1b 100644 --- a/example/auto_compression/detection/post_process.py +++ b/example/auto_compression/detection/post_process.py @@ -122,7 +122,7 @@ def _non_max_suppression(self, prediction, scale_factor): picked_labels.extend([class_index] * box_probs.shape[0]) if len(picked_box_probs) == 0: - out_boxes_list.append(np.empty((0, 4))) + out_boxes_list.append(np.empty((0, 6))) else: picked_box_probs = np.concatenate(picked_box_probs)
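
For reference, a minimal NumPy sketch of the channelwise (blocksize == -1) NF4 path added in patch 1/3. This is not the CUDA kernel itself: the codebook below is the standard NF4 table used by bitsandbytes-style kernels (assumed to match dDequantizeNF4), and the nearest-value lookup is a simplification of dQuantizeNF4's decision tree, so values exactly on a quantization boundary may round slightly differently.

import numpy as np

# Standard NF4 codebook (bitsandbytes values); assumed to match dDequantizeNF4.
NF4_CODE = np.array([
    -1.0, -0.6961928009986877, -0.5250730514526367, -0.39491748809814453,
    -0.28444138169288635, -0.18477343022823334, -0.09105003625154495, 0.0,
    0.07958029955625534, 0.16093020141124725, 0.24611230194568634,
    0.33791524171829224, 0.44070982933044434, 0.5626170039176941,
    0.7229568362236023, 1.0])

def quantize_channelwise_nf4(A):
    rows, cout = A.shape                   # rows must be even
    absmax = np.abs(A).max(axis=0)         # per-channel scale, shape (cout,), assumed > 0
    normed = A / absmax
    codes = np.abs(normed[..., None] - NF4_CODE).argmin(axis=-1).astype(np.uint8)
    # Each output byte packs two rows of the same column: row 2r in the high nibble,
    # row 2r+1 in the low nibble (matches kQuantizeChannelwise's 2*(i/cout)*cout + i%cout).
    packed = ((codes[0::2] << 4) | codes[1::2]).astype(np.uint8)   # (rows // 2, cout)
    return packed, absmax

def dequantize_channelwise_nf4(packed, absmax):
    hi = NF4_CODE[(packed >> 4) & 0x0F] * absmax   # rows 0, 2, 4, ...
    lo = NF4_CODE[packed & 0x0F] * absmax          # rows 1, 3, 5, ...
    out = np.empty((packed.shape[0] * 2, packed.shape[1]), dtype=np.float32)
    out[0::2], out[1::2] = hi, lo
    return out

# Round-trip example: w_hat approximates w using 4 bits per weight plus one
# float32 absmax per output channel.
w = np.random.randn(8, 4).astype(np.float32)
packed, absmax = quantize_channelwise_nf4(w)
w_hat = dequantize_channelwise_nf4(packed, absmax)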
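A side note on the --include_nms handling in patches 2/3: argparse's type=bool converts any non-empty string (including "False") to True, which is why the flag was switched to a string and converted back to a bool manually in __main__. A hypothetical str2bool converter (not part of the patch) centralizes that conversion:

import argparse

def str2bool(v):
    # Accept the usual command-line spellings of booleans.
    if isinstance(v, bool):
        return v
    if v.lower() in ("true", "t", "yes", "1"):
        return True
    if v.lower() in ("false", "f", "no", "0"):
        return False
    raise argparse.ArgumentTypeError("boolean value expected, got %r" % v)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--include_nms",
    type=str2bool,
    default=True,
    help="Whether the exported model already contains NMS.")
print(parser.parse_args(["--include_nms", "False"]).include_nms)  # -> False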