From 603e0b8a6835aa22d9e92990c38307dc3c2f524d Mon Sep 17 00:00:00 2001 From: ceci3 Date: Tue, 19 Dec 2023 18:30:32 +0800 Subject: [PATCH 1/3] support nf4 channel wise quant & fix bug when blocksize>512 (#1817) --- csrc/lc/dequantize_blockwise.cu | 84 ++++++++++++++++++++--- csrc/lc/quantize_blockwise.cu | 115 ++++++++++++++++++++++++-------- 2 files changed, 162 insertions(+), 37 deletions(-) diff --git a/csrc/lc/dequantize_blockwise.cu b/csrc/lc/dequantize_blockwise.cu index 8046c34ac..0bf76a163 100644 --- a/csrc/lc/dequantize_blockwise.cu +++ b/csrc/lc/dequantize_blockwise.cu @@ -201,7 +201,6 @@ template __global__ void kDequantizeBlockwise(const floa //template __global__ void kDequantizeBlockwise<__nv_bfloat16, 512, 64, 8, NF4>(const float *code, const unsigned char * A, const float * absmax, __nv_bfloat16 *out, int blocksize, int n); - template void dequantize_blockwise(const float *code, const unsigned char *A, const float *absmax, T *out, int blocksize, int n) { int num_blocks = n/blocksize; @@ -226,6 +225,50 @@ template void dequantize_blockwise(const float *code, const unsigned //template void dequantize_blockwise<__nv_bfloat16, FP4>(const float *code, const unsigned char *A, const float *absmax, __nv_bfloat16 *out, int blocksize, int n); //template void dequantize_blockwise<__nv_bfloat16, NF4>(const float *code, const unsigned char *A, const float *absmax, __nv_bfloat16 *out, int blocksize, int n); +template +__global__ void kDequantizeChannelwise(const unsigned char* A, + const float *absmax, + float *out, + int n, + int cout) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + + int num = n / 2; + //int part_n = num / cout; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + float local_absmax = absmax[i%cout]; + int idx = 2*(i/cout)* cout + i%cout; + switch(DATA_TYPE) + { + case FP4: + out[i*2 + i%cout] = dDequantizeFP4Tree(A[i] >> 4, local_absmax); + out[i*2 + cout + i%cout] = dDequantizeFP4Tree(A[i] & 0x0F, local_absmax); + break; + case NF4: + out[idx] = dDequantizeNF4(A[i] >> 4)* local_absmax; + out[idx + cout] = dDequantizeNF4(A[i] & 0x0F)* local_absmax; + break; + } + __syncthreads(); + } +} + +template void dequantize_channelwise(const unsigned char *A, const float *absmax, T *out, int n, int cout) +{ + int max_threads = 1024; + int64_t block_size = + std::min(static_cast(n), + static_cast(max_threads/ 4)); + + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (n + block_size - 1) / block_size); + + kDequantizeChannelwise<<>>(A, absmax, out, n, cout); + CUDA_CHECK_RETURN(cudaPeekAtLastError()); +} + std::vector DequantizeBlockwise(const paddle::Tensor& input, const paddle::Tensor& code, const paddle::Tensor& absmax, int blocksize, std::string quant_type) { int64_t input_numel = input.numel(); int n = input_numel; @@ -234,23 +277,44 @@ std::vector DequantizeBlockwise(const paddle::Tensor& input, con out_shape = {input_numel * 2, 1}; n = n * 2; } + if (blocksize == -1) { + out_shape = {input.shape()[0] * 2, input.shape()[1]}; + } auto out = paddle::empty(out_shape, paddle::DataType::FLOAT32, input.place()); - if (quant_type == "8bit") - dequantize_blockwise(code.data(), input.data(), absmax.data(), out.data(), blocksize, n); - else if (quant_type == "nf4") - dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); - else if (quant_type == "fp4") - dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); - else - 
PD_THROW("NOT supported quant type. Only 8bit, nf4, fp4 are supported. "); + if (blocksize == -1) { + if (quant_type == "8bit") + PD_THROW("blocksize is -1 only support NF4 and FP4."); + else + blocksize = n / absmax.numel() * 2; + + int cout = input.shape()[1]; + if (quant_type == "nf4") + dequantize_channelwise(input.data(), absmax.data(), out.data(), n, cout); + else if (quant_type == "fp4") + dequantize_channelwise(input.data(), absmax.data(), out.data(), n, cout); + else + PD_THROW("NOT supported quant type. Only 8bit, nf4, fp4 are supported. "); + } else { + if (quant_type == "8bit") + dequantize_blockwise(code.data(), input.data(), absmax.data(), out.data(), blocksize, n); + else if (quant_type == "nf4") + dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); + else if (quant_type == "fp4") + dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); + else + PD_THROW("NOT supported quant type. Only 8bit, nf4, fp4 are supported. "); + } return {out}; }; std::vector> GetDequantizeBlockwiseInferShape(const std::vector& input_shape, const std::vector& code_shape, const std::vector& abs_max_shape, int blocksize, std::string quant_type){ int64_t first_shape = input_shape[0] * input_shape[1] * 2; if (quant_type != "8bit") - return {{first_shape, 1}}; + if (blocksize != -1) + return {{first_shape, 1}}; + else + return {{input_shape[0] * 2, input_shape[1]}}; else return {input_shape}; } diff --git a/csrc/lc/quantize_blockwise.cu b/csrc/lc/quantize_blockwise.cu index d4f6ff2ca..e8e55b9d8 100644 --- a/csrc/lc/quantize_blockwise.cu +++ b/csrc/lc/quantize_blockwise.cu @@ -279,6 +279,7 @@ __global__ void kQuantizeBlockwise(const float * code, const T * __restrict__ A, #pragma unroll NUM_PER_TH for(int j = 0; j < NUM_PER_TH/2; j++) { + packed_4bit = 0; packed_4bit |= dQuantizeNF4(((float)vals[2*j])*local_abs_max) << 4; packed_4bit |= dQuantizeNF4(((float)vals[2*j+1])*local_abs_max); qvals[j] = packed_4bit; @@ -360,9 +361,39 @@ MAKE_kQuantizeBlockwise(__nv_bfloat16, 256, 2, NF4) MAKE_kQuantizeBlockwise(__nv_bfloat16, 128, 2, NF4) MAKE_kQuantizeBlockwise(__nv_bfloat16, 64, 2, NF4) +template +__global__ void kQuantizeChannelwise(const float *code, + const T* A, + unsigned char* out, + float *absmax, + int n, + int cout) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + + int num = n / 2; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + int idx = 2*(i/cout)* cout + i%cout; + float local_absmax = absmax[i %cout]; + float inv_local_absmax = 1.0f/local_absmax; + unsigned char packed_4bit = 0; + switch(DATA_TYPE) + { + case FP4: + packed_4bit |= dQuantizeFP4(((float)A[idx])*inv_local_absmax) << 4; + packed_4bit |= dQuantizeFP4(((float)A[idx+cout])*inv_local_absmax); + out[i] = packed_4bit; + break; + case NF4: + packed_4bit |= dQuantizeNF4(((float)A[idx])*inv_local_absmax) << 4; + packed_4bit |= dQuantizeNF4(((float)A[idx+cout])*inv_local_absmax); + out[i] = packed_4bit; + break; + } + } +} -template void quantize_blockwise(const float *code, const paddle::Tensor& A, float *absmax, unsigned char *out, int blocksize, int n) +template void quantize_blockwise(const float *code, const paddle::Tensor& A, paddle::Tensor& absmax, unsigned char *out, int blocksize, int n, int channelwise) { typedef PDTraits traits_; typedef typename traits_::DataType DataType_; @@ -372,22 +403,43 @@ template void quantize_blockwise(const float num_blocks = n % blocksize == 0 ? 
num_blocks : num_blocks + 1; const DataType_* A_data = reinterpret_cast(A.data()); - if(blocksize == 4096) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 2048) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 1024) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 512) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 256) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 128) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 64) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else - PD_THROW("only support blocksize is [64, 128, 256, 512, 1024, 2048, 4096]."); + if (channelwise == 0) { + if(blocksize == 4096) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 2048) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 1024) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 512) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 256) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 128) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 64) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + } + else { + if (DATA_TYPE == General8bit) + PD_THROW("blocksize is -1 only support NF4 and FP4."); + + int cout = A.shape()[1]; + int max_threads = 1024; + + absmax = A.abs().max({0}); + + int64_t block_size = + std::min(static_cast(n), + static_cast(max_threads/ 4)); + + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (n + block_size - 1) / block_size); + + kQuantizeChannelwise<<>>( + code, A_data, out, absmax.data(), n, cout); + } CUDA_CHECK_RETURN(cudaPeekAtLastError()); @@ -395,38 +447,44 @@ template void quantize_blockwise(const float std::vector QuantizeBlockwise(const paddle::Tensor& input, const paddle::Tensor& code, int blocksize, std::string quant_type) { int n = input.numel(); + int channelwise = 0; std::vector out_shape = input.shape(); if (quant_type != "8bit") { // 4bit out_shape = {(n + 1) / 2, 1}; } + if (blocksize == -1){ + blocksize = input.shape()[0]; + out_shape = {input.shape()[0]/2, input.shape()[1]}; + channelwise = 1; + } auto out = paddle::empty(out_shape, paddle::DataType::UINT8, input.place()); int64_t absmax_shape = n / blocksize; auto absmax = paddle::empty({absmax_shape}, paddle::DataType::FLOAT32, input.place()); switch(input.type()) { case paddle::DataType::FLOAT32: if (quant_type == "8bit") - quantize_blockwise(code.data(), input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(code.data(), input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "nf4") { - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); } else if (quant_type == "fp4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); return {out, absmax}; case paddle::DataType::FLOAT16: if (quant_type == "8bit") - quantize_blockwise(code.data(), input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(code.data(), input, absmax, out.data(), blocksize, n, 
channelwise); else if (quant_type == "nf4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "fp4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); return {out, absmax}; case paddle::DataType::BFLOAT16: if (quant_type == "8bit") - quantize_blockwise(code.data(), input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(code.data(), input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "nf4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "fp4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); return {out, absmax}; default: @@ -440,7 +498,10 @@ std::vector QuantizeBlockwise(const paddle::Tensor& input, const std::vector> GetQuantizeBlockwiseInferShape(const std::vector& input_shape, const std::vector& code_shape, int blocksize, std::string quant_type){ int64_t first_shape = (input_shape[0] * input_shape[1] + 1) / 2; if (quant_type != "8bit") - return {{first_shape, 1}}; + if (blocksize != -1) + return {{first_shape, 1}}; + else + return {{input_shape[0]/2, input_shape[1]}}; else return {input_shape}; } From 40d48cc92aed5ed65493e52586db35b259d4b112 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Fri, 29 Dec 2023 07:36:58 +0000 Subject: [PATCH 2/3] =?UTF-8?q?=E8=B7=91=E9=80=9A+=E6=94=AF=E6=8C=81?= =?UTF-8?q?=E6=B5=8B=E9=80=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../detection/paddle_inference_eval.py | 94 +++++++++++++++++-- 1 file changed, 84 insertions(+), 10 deletions(-) diff --git a/example/auto_compression/detection/paddle_inference_eval.py b/example/auto_compression/detection/paddle_inference_eval.py index d2e12afd1..c66ce7113 100644 --- a/example/auto_compression/detection/paddle_inference_eval.py +++ b/example/auto_compression/detection/paddle_inference_eval.py @@ -82,9 +82,15 @@ def argsparser(): parser.add_argument("--img_shape", type=int, default=640, help="input_size") parser.add_argument( '--include_nms', - type=bool, - default=True, + type=str, + default='True', help="Whether include nms or not.") + # 是否用来测速 + parser.add_argument( + '--speed', + type=str, + default='True', + help="if speed is True, it will print the inference time.") return parser @@ -238,9 +244,11 @@ def load_predictor( config = Config( os.path.join(model_dir, "model.pdmodel"), os.path.join(model_dir, "model.pdiparams")) + + config.enable_memory_optim() if device == "GPU": # initial GPU memory(M), device ID - config.enable_use_gpu(200, 0) + config.enable_use_gpu(1000, 0) # optimize graph and fuse op config.switch_ir_optim(True) else: @@ -260,7 +268,7 @@ def load_predictor( } if precision in precision_map.keys() and use_trt: config.enable_tensorrt_engine( - workspace_size=(1 << 25) * batch_size, + workspace_size=(1 << 30) * batch_size, max_batch_size=batch_size, min_subgraph_size=min_subgraph_size, precision_mode=precision_map[precision], @@ -297,6 +305,7 @@ def predict_image(predictor, img, scale_factor = image_preprocess(image_file, image_shape) inputs = {} inputs["image"] = img + if FLAGS.include_nms: inputs['scale_factor'] 
= scale_factor
     input_names = predictor.get_input_names()
@@ -354,6 +363,9 @@ def eval(predictor, val_loader, metric, rerun_flag=False):
     input_names = predictor.get_input_names()
     output_names = predictor.get_output_names()
     boxes_tensor = predictor.get_output_handle(output_names[0])
+    print("output_names:", output_names)
+    print("Number of outputs:", len(output_names))
+    print("FLAGS.include_nms:", FLAGS.include_nms)
     if FLAGS.include_nms:
         boxes_num = predictor.get_output_handle(output_names[1])
     for batch_id, data in enumerate(val_loader):
@@ -374,27 +386,79 @@ def eval(predictor, val_loader, metric, rerun_flag=False):
             time_min = min(time_min, timed)
             time_max = max(time_max, timed)
             predict_time += timed
-        if not FLAGS.include_nms:
+        # print("FLAGS.include_nms:", FLAGS.include_nms)
+        # print("FLAGS.speed:", FLAGS.speed)
+        # If include_nms is False and FLAGS.speed is True, run PPYOLOEPostProcess
+        if not FLAGS.include_nms and FLAGS.speed:
+            # print("took the PPYOLOEPostProcess branch")
             postprocess = PPYOLOEPostProcess(
                 score_threshold=0.3, nms_threshold=0.6)
             res = postprocess(np_boxes, data_all['scale_factor'])
-        else:
+        # If include_nms is False and FLAGS.speed is False, skip this batch
+        elif not FLAGS.include_nms and not FLAGS.speed:
+            continue
+        # If include_nms is True, use the model outputs directly
+        elif FLAGS.include_nms:
+            # print("returned the raw outputs directly")
             res = {'bbox': np_boxes, 'bbox_num': np_boxes_num}
         metric.update(data_all, res)
         if batch_id % 100 == 0:
             print("Eval iter:", batch_id)
             sys.stdout.flush()
     metric.accumulate()
-    metric.log()
+    if not FLAGS.speed:
+        metric.log()
     map_res = metric.get_results()
     metric.reset()
     time_avg = predict_time / sample_nums
     print("[Benchmark]Inference time(ms): min={}, max={}, avg={}".format(
         round(time_min * 1000, 2),
         round(time_max * 1000, 1), round(time_avg * 1000, 1)))
-    print("[Benchmark] COCO mAP: {}".format(map_res["bbox"][0]))
+    if not FLAGS.speed:
+        print("[Benchmark] COCO mAP: {}".format(map_res["bbox"][0]))
     sys.stdout.flush()
 
+def inference_time(predictor, val_loader, metric, rerun_flag=False):
+    cpu_mems, gpu_mems = 0, 0
+    predict_time = 0.0
+    time_min = float("inf")
+    time_max = float("-inf")
+    sample_nums = len(val_loader)
+    input_names = predictor.get_input_names()
+    output_names = predictor.get_output_names()
+    boxes_tensor = predictor.get_output_handle(output_names[0])
+    print("output_names:", output_names)
+    print("Number of outputs:", len(output_names))
+    print("FLAGS.include_nms:", FLAGS.include_nms)
+    if FLAGS.include_nms:
+        boxes_num = predictor.get_output_handle(output_names[1])
+
+    for batch_id, data in enumerate(val_loader):
+        data_all = {k: np.array(v) for k, v in data.items()}
+        for i, _ in enumerate(input_names):
+            input_tensor = predictor.get_input_handle(input_names[i])
+            input_tensor.copy_from_cpu(data_all[input_names[i]])
+        paddle.device.cuda.synchronize()
+        start_time = time.time()
+        predictor.run()
+        # np_boxes = boxes_tensor.copy_to_cpu()
+        if FLAGS.include_nms:
+            np_boxes_num = boxes_num.copy_to_cpu()
+        if rerun_flag:
+            return
+        end_time = time.time()
+        timed = end_time - start_time
+        time_min = min(time_min, timed)
+        time_max = max(time_max, timed)
+        predict_time += timed
+        # print("FLAGS.include_nms:", FLAGS.include_nms)
+        # print("FLAGS.speed:", FLAGS.speed)
+        # If include_nms is False and FLAGS.speed is True, run PPYOLOEPostProcess
+    time_avg = predict_time / sample_nums
+    print("[Benchmark]Inference time(ms): min={}, max={}, avg={}".format(
+        round(time_min * 1000, 2),
+        round(time_max * 1000, 1), round(time_avg * 1000, 1)))
+    sys.stdout.flush()
 
 def main():
     """
@@ -421,7 +485,7 @@ def main():
repeats=repeats) else: reader_cfg = load_config(FLAGS.reader_config) - + dataset = reader_cfg["EvalDataset"] global val_loader val_loader = create("EvalReader")( @@ -432,7 +496,10 @@ def main(): anno_file = dataset.get_anno() metric = COCOMetric( anno_file=anno_file, clsid2catid=clsid2catid, IouType="bbox") - eval(predictor, val_loader, metric, rerun_flag=rerun_flag) + if not FLAGS.speed: + eval(predictor, val_loader, metric, rerun_flag=rerun_flag) + else: + inference_time(predictor, val_loader, metric, rerun_flag=rerun_flag) if rerun_flag: print( @@ -444,6 +511,13 @@ def main(): paddle.enable_static() parser = argsparser() FLAGS = parser.parse_args() + if FLAGS.include_nms=='True': + FLAGS.include_nms = True + else: + FLAGS.include_nms = False + + print('**************main****************') + print(FLAGS) # DataLoader need run on cpu paddle.set_device("cpu") From 26a6d919b79cf114f9b3486aa13f5a78d7258977 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Wed, 3 Jan 2024 09:07:34 +0000 Subject: [PATCH 3/3] =?UTF-8?q?=E6=9C=80=E7=BB=88=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../detection/configs/ppyoloe_l_qat_dis.yaml | 2 +- .../detection/configs/yolo_reader.yml | 4 +- .../detection/paddle_inference_eval.py | 94 ++++--------------- .../detection/post_process.py | 2 +- 4 files changed, 21 insertions(+), 81 deletions(-) diff --git a/example/auto_compression/detection/configs/ppyoloe_l_qat_dis.yaml b/example/auto_compression/detection/configs/ppyoloe_l_qat_dis.yaml index 7102142ed..017e88b44 100644 --- a/example/auto_compression/detection/configs/ppyoloe_l_qat_dis.yaml +++ b/example/auto_compression/detection/configs/ppyoloe_l_qat_dis.yaml @@ -2,7 +2,7 @@ Global: reader_config: configs/yolo_reader.yml arch: PPYOLOE - include_nms: True + include_nms: False Evaluation: True model_dir: ./ppyoloe_crn_l_300e_coco model_filename: model.pdmodel diff --git a/example/auto_compression/detection/configs/yolo_reader.yml b/example/auto_compression/detection/configs/yolo_reader.yml index d10614530..6e013c1b9 100644 --- a/example/auto_compression/detection/configs/yolo_reader.yml +++ b/example/auto_compression/detection/configs/yolo_reader.yml @@ -6,13 +6,13 @@ TrainDataset: !COCODataSet image_dir: train2017 anno_path: annotations/instances_train2017.json - dataset_dir: dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ EvalDataset: !COCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json - dataset_dir: dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ worker_num: 0 diff --git a/example/auto_compression/detection/paddle_inference_eval.py b/example/auto_compression/detection/paddle_inference_eval.py index c66ce7113..3702bd49a 100644 --- a/example/auto_compression/detection/paddle_inference_eval.py +++ b/example/auto_compression/detection/paddle_inference_eval.py @@ -18,6 +18,7 @@ import sys import cv2 import numpy as np +from tqdm import tqdm import paddle from paddle.inference import Config @@ -85,12 +86,12 @@ def argsparser(): type=str, default='True', help="Whether include nms or not.") - # 是否用来测速 parser.add_argument( - '--speed', - type=str, - default='True', - help="if speed is True, it will print the inference time.") + "--trt_calib_mode", + type=bool, + default=False, + help="If the model is produced by TRT offline quantitative " + "calibration, trt_calib_mode need to set True.") return parser @@ -214,8 +215,9 @@ def load_predictor( 
use_mkldnn=False, batch_size=1, device="CPU", - min_subgraph_size=3, + min_subgraph_size=4, use_dynamic_shape=False, + trt_calib_mode=False, trt_min_shape=1, trt_max_shape=1280, trt_opt_shape=640, @@ -273,7 +275,7 @@ def load_predictor( min_subgraph_size=min_subgraph_size, precision_mode=precision_map[precision], use_static=True, - use_calib_mode=False, ) + use_calib_mode=False) if use_dynamic_shape: dynamic_shape_file = os.path.join(FLAGS.model_path, @@ -363,12 +365,9 @@ def eval(predictor, val_loader, metric, rerun_flag=False): input_names = predictor.get_input_names() output_names = predictor.get_output_names() boxes_tensor = predictor.get_output_handle(output_names[0]) - print("output_names:", output_names) - print("Number of outputs:", len(output_names)) - print("FLAGS.include_nms:", FLAGS.include_nms) if FLAGS.include_nms: boxes_num = predictor.get_output_handle(output_names[1]) - for batch_id, data in enumerate(val_loader): + for batch_id, data in tqdm(enumerate(val_loader), total=len(val_loader), desc='Evaluating'): data_all = {k: np.array(v) for k, v in data.items()} for i, _ in enumerate(input_names): input_tensor = predictor.get_input_handle(input_names[i]) @@ -386,79 +385,26 @@ def eval(predictor, val_loader, metric, rerun_flag=False): time_min = min(time_min, timed) time_max = max(time_max, timed) predict_time += timed - # print("FLAGS.include_nms:", FLAGS.include_nms) - # print("FLAGS.speed:", FLAGS.speed) - # 如果include_nms为false且flags.speed为True,则走PPYOLOEPostProcess - if not FLAGS.include_nms and FLAGS.speed: - # print("nms为True的时候走了PPYOLOEPostProcess") + if not FLAGS.include_nms: postprocess = PPYOLOEPostProcess( score_threshold=0.3, nms_threshold=0.6) res = postprocess(np_boxes, data_all['scale_factor']) - #如果include_nms为false且flags.speed为False,则跳过 - elif not FLAGS.include_nms and not FLAGS.speed: - continue - #如果include_nms,则直接返回 - elif FLAGS.include_nms: - # print("nms为False的时候直接返回") + else: res = {'bbox': np_boxes, 'bbox_num': np_boxes_num} metric.update(data_all, res) if batch_id % 100 == 0: - print("Eval iter:", batch_id) sys.stdout.flush() metric.accumulate() - if not FLAGS.speed: - metric.log() + metric.log() map_res = metric.get_results() metric.reset() time_avg = predict_time / sample_nums print("[Benchmark]Inference time(ms): min={}, max={}, avg={}".format( round(time_min * 1000, 2), round(time_max * 1000, 1), round(time_avg * 1000, 1))) - if not FLAGS.speed: - print("[Benchmark] COCO mAP: {}".format(map_res["bbox"][0])) + print("[Benchmark] COCO mAP: {}".format(map_res["bbox"][0])) sys.stdout.flush() -def inference_time(predictor, val_loader, metric, rerun_flag=False): - cpu_mems, gpu_mems = 0, 0 - predict_time = 0.0 - time_min = float("inf") - time_max = float("-inf") - sample_nums = len(val_loader) - input_names = predictor.get_input_names() - output_names = predictor.get_output_names() - boxes_tensor = predictor.get_output_handle(output_names[0]) - print("output_names:", output_names) - print("Number of outputs:", len(output_names)) - print("FLAGS.include_nms:", FLAGS.include_nms) - if FLAGS.include_nms: - boxes_num = predictor.get_output_handle(output_names[1]) - - for batch_id, data in enumerate(val_loader): - data_all = {k: np.array(v) for k, v in data.items()} - for i, _ in enumerate(input_names): - input_tensor = predictor.get_input_handle(input_names[i]) - input_tensor.copy_from_cpu(data_all[input_names[i]]) - paddle.device.cuda.synchronize() - start_time = time.time() - predictor.run() - # np_boxes = boxes_tensor.copy_to_cpu() - if FLAGS.include_nms: - 
np_boxes_num = boxes_num.copy_to_cpu() - if rerun_flag: - return - end_time = time.time() - timed = end_time - start_time - time_min = min(time_min, timed) - time_max = max(time_max, timed) - predict_time += timed - # print("FLAGS.include_nms:", FLAGS.include_nms) - # print("FLAGS.speed:", FLAGS.speed) - # 如果include_nms为false且flags.speed为True,则走PPYOLOEPostProcess - time_avg = predict_time / sample_nums - print("[Benchmark]Inference time(ms): min={}, max={}, avg={}".format( - round(time_min * 1000, 2), - round(time_max * 1000, 1), round(time_avg * 1000, 1))) - sys.stdout.flush() def main(): """ @@ -485,7 +431,6 @@ def main(): repeats=repeats) else: reader_cfg = load_config(FLAGS.reader_config) - dataset = reader_cfg["EvalDataset"] global val_loader val_loader = create("EvalReader")( @@ -496,11 +441,9 @@ def main(): anno_file = dataset.get_anno() metric = COCOMetric( anno_file=anno_file, clsid2catid=clsid2catid, IouType="bbox") - if not FLAGS.speed: - eval(predictor, val_loader, metric, rerun_flag=rerun_flag) - else: - inference_time(predictor, val_loader, metric, rerun_flag=rerun_flag) - + + eval(predictor, val_loader, metric, rerun_flag=rerun_flag) + if rerun_flag: print( "***** Collect dynamic shape done, Please rerun the program to get correct results. *****" @@ -516,9 +459,6 @@ def main(): else: FLAGS.include_nms = False - print('**************main****************') - print(FLAGS) - # DataLoader need run on cpu paddle.set_device("cpu") diff --git a/example/auto_compression/detection/post_process.py b/example/auto_compression/detection/post_process.py index eea2f0195..76fc5ac1b 100644 --- a/example/auto_compression/detection/post_process.py +++ b/example/auto_compression/detection/post_process.py @@ -122,7 +122,7 @@ def _non_max_suppression(self, prediction, scale_factor): picked_labels.extend([class_index] * box_probs.shape[0]) if len(picked_box_probs) == 0: - out_boxes_list.append(np.empty((0, 4))) + out_boxes_list.append(np.empty((0, 6))) else: picked_box_probs = np.concatenate(picked_box_probs)
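
For reference, a minimal NumPy sketch of the channelwise (blocksize == -1) NF4 path added in patch 1/3. This is not the CUDA kernel itself: the codebook below is the standard NF4 table used by bitsandbytes-style kernels (assumed to match dDequantizeNF4), and the nearest-value lookup is a simplification of dQuantizeNF4's decision tree, so values exactly on a quantization boundary may round slightly differently.

import numpy as np

# Standard NF4 codebook (bitsandbytes values); assumed to match dDequantizeNF4.
NF4_CODE = np.array([
    -1.0, -0.6961928009986877, -0.5250730514526367, -0.39491748809814453,
    -0.28444138169288635, -0.18477343022823334, -0.09105003625154495, 0.0,
    0.07958029955625534, 0.16093020141124725, 0.24611230194568634,
    0.33791524171829224, 0.44070982933044434, 0.5626170039176941,
    0.7229568362236023, 1.0])

def quantize_channelwise_nf4(A):
    rows, cout = A.shape                   # rows must be even
    absmax = np.abs(A).max(axis=0)         # per-channel scale, shape (cout,), assumed > 0
    normed = A / absmax
    codes = np.abs(normed[..., None] - NF4_CODE).argmin(axis=-1).astype(np.uint8)
    # Each output byte packs two rows of the same column: row 2r in the high nibble,
    # row 2r+1 in the low nibble (matches kQuantizeChannelwise's 2*(i/cout)*cout + i%cout).
    packed = ((codes[0::2] << 4) | codes[1::2]).astype(np.uint8)   # (rows // 2, cout)
    return packed, absmax

def dequantize_channelwise_nf4(packed, absmax):
    hi = NF4_CODE[(packed >> 4) & 0x0F] * absmax   # rows 0, 2, 4, ...
    lo = NF4_CODE[packed & 0x0F] * absmax          # rows 1, 3, 5, ...
    out = np.empty((packed.shape[0] * 2, packed.shape[1]), dtype=np.float32)
    out[0::2], out[1::2] = hi, lo
    return out

# Round-trip example: w_hat approximates w using 4 bits per weight plus one
# float32 absmax per output channel.
w = np.random.randn(8, 4).astype(np.float32)
packed, absmax = quantize_channelwise_nf4(w)
w_hat = dequantize_channelwise_nf4(packed, absmax)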
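A side note on the --include_nms handling in patches 2/3: argparse's type=bool converts any non-empty string (including "False") to True, which is why the flag was switched to a string and converted back to a bool manually in __main__. A hypothetical str2bool converter (not part of the patch) centralizes that conversion:

import argparse

def str2bool(v):
    # Accept the usual command-line spellings of booleans.
    if isinstance(v, bool):
        return v
    if v.lower() in ("true", "t", "yes", "1"):
        return True
    if v.lower() in ("false", "f", "no", "0"):
        return False
    raise argparse.ArgumentTypeError("boolean value expected, got %r" % v)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--include_nms",
    type=str2bool,
    default=True,
    help="Whether the exported model already contains NMS.")
print(parser.parse_args(["--include_nms", "False"]).include_nms)  # -> False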