From 603e0b8a6835aa22d9e92990c38307dc3c2f524d Mon Sep 17 00:00:00 2001 From: ceci3 Date: Tue, 19 Dec 2023 18:30:32 +0800 Subject: [PATCH 1/4] support nf4 channel wise quant & fix bug when blocksize>512 (#1817) --- csrc/lc/dequantize_blockwise.cu | 84 ++++++++++++++++++++--- csrc/lc/quantize_blockwise.cu | 115 ++++++++++++++++++++++++-------- 2 files changed, 162 insertions(+), 37 deletions(-) diff --git a/csrc/lc/dequantize_blockwise.cu b/csrc/lc/dequantize_blockwise.cu index 8046c34ac..0bf76a163 100644 --- a/csrc/lc/dequantize_blockwise.cu +++ b/csrc/lc/dequantize_blockwise.cu @@ -201,7 +201,6 @@ template __global__ void kDequantizeBlockwise(const floa //template __global__ void kDequantizeBlockwise<__nv_bfloat16, 512, 64, 8, NF4>(const float *code, const unsigned char * A, const float * absmax, __nv_bfloat16 *out, int blocksize, int n); - template void dequantize_blockwise(const float *code, const unsigned char *A, const float *absmax, T *out, int blocksize, int n) { int num_blocks = n/blocksize; @@ -226,6 +225,50 @@ template void dequantize_blockwise(const float *code, const unsigned //template void dequantize_blockwise<__nv_bfloat16, FP4>(const float *code, const unsigned char *A, const float *absmax, __nv_bfloat16 *out, int blocksize, int n); //template void dequantize_blockwise<__nv_bfloat16, NF4>(const float *code, const unsigned char *A, const float *absmax, __nv_bfloat16 *out, int blocksize, int n); +template +__global__ void kDequantizeChannelwise(const unsigned char* A, + const float *absmax, + float *out, + int n, + int cout) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + + int num = n / 2; + //int part_n = num / cout; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + float local_absmax = absmax[i%cout]; + int idx = 2*(i/cout)* cout + i%cout; + switch(DATA_TYPE) + { + case FP4: + out[i*2 + i%cout] = dDequantizeFP4Tree(A[i] >> 4, local_absmax); + out[i*2 + cout + i%cout] = dDequantizeFP4Tree(A[i] & 0x0F, local_absmax); + break; + case NF4: + out[idx] = dDequantizeNF4(A[i] >> 4)* local_absmax; + out[idx + cout] = dDequantizeNF4(A[i] & 0x0F)* local_absmax; + break; + } + __syncthreads(); + } +} + +template void dequantize_channelwise(const unsigned char *A, const float *absmax, T *out, int n, int cout) +{ + int max_threads = 1024; + int64_t block_size = + std::min(static_cast(n), + static_cast(max_threads/ 4)); + + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (n + block_size - 1) / block_size); + + kDequantizeChannelwise<<>>(A, absmax, out, n, cout); + CUDA_CHECK_RETURN(cudaPeekAtLastError()); +} + std::vector DequantizeBlockwise(const paddle::Tensor& input, const paddle::Tensor& code, const paddle::Tensor& absmax, int blocksize, std::string quant_type) { int64_t input_numel = input.numel(); int n = input_numel; @@ -234,23 +277,44 @@ std::vector DequantizeBlockwise(const paddle::Tensor& input, con out_shape = {input_numel * 2, 1}; n = n * 2; } + if (blocksize == -1) { + out_shape = {input.shape()[0] * 2, input.shape()[1]}; + } auto out = paddle::empty(out_shape, paddle::DataType::FLOAT32, input.place()); - if (quant_type == "8bit") - dequantize_blockwise(code.data(), input.data(), absmax.data(), out.data(), blocksize, n); - else if (quant_type == "nf4") - dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); - else if (quant_type == "fp4") - dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); - else - 
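// NOTE: sketch of the channel-wise 4-bit layout, as read from the NF4 branch of
// kDequantizeChannelwise. Each byte of A packs two codes for the same output column:
// packed element i expands to
//   idx        = 2*(i/cout)*cout + i%cout   // row 2*(i/cout),   column i%cout  (high nibble)
//   idx + cout                              // row 2*(i/cout)+1, column i%cout  (low nibble)
// and absmax holds one scale per output column, so local_absmax = absmax[i % cout].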
PD_THROW("NOT supported quant type. Only 8bit, nf4, fp4 are supported. "); + if (blocksize == -1) { + if (quant_type == "8bit") + PD_THROW("blocksize is -1 only support NF4 and FP4."); + else + blocksize = n / absmax.numel() * 2; + + int cout = input.shape()[1]; + if (quant_type == "nf4") + dequantize_channelwise(input.data(), absmax.data(), out.data(), n, cout); + else if (quant_type == "fp4") + dequantize_channelwise(input.data(), absmax.data(), out.data(), n, cout); + else + PD_THROW("NOT supported quant type. Only 8bit, nf4, fp4 are supported. "); + } else { + if (quant_type == "8bit") + dequantize_blockwise(code.data(), input.data(), absmax.data(), out.data(), blocksize, n); + else if (quant_type == "nf4") + dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); + else if (quant_type == "fp4") + dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); + else + PD_THROW("NOT supported quant type. Only 8bit, nf4, fp4 are supported. "); + } return {out}; }; std::vector> GetDequantizeBlockwiseInferShape(const std::vector& input_shape, const std::vector& code_shape, const std::vector& abs_max_shape, int blocksize, std::string quant_type){ int64_t first_shape = input_shape[0] * input_shape[1] * 2; if (quant_type != "8bit") - return {{first_shape, 1}}; + if (blocksize != -1) + return {{first_shape, 1}}; + else + return {{input_shape[0] * 2, input_shape[1]}}; else return {input_shape}; } diff --git a/csrc/lc/quantize_blockwise.cu b/csrc/lc/quantize_blockwise.cu index d4f6ff2ca..e8e55b9d8 100644 --- a/csrc/lc/quantize_blockwise.cu +++ b/csrc/lc/quantize_blockwise.cu @@ -279,6 +279,7 @@ __global__ void kQuantizeBlockwise(const float * code, const T * __restrict__ A, #pragma unroll NUM_PER_TH for(int j = 0; j < NUM_PER_TH/2; j++) { + packed_4bit = 0; packed_4bit |= dQuantizeNF4(((float)vals[2*j])*local_abs_max) << 4; packed_4bit |= dQuantizeNF4(((float)vals[2*j+1])*local_abs_max); qvals[j] = packed_4bit; @@ -360,9 +361,39 @@ MAKE_kQuantizeBlockwise(__nv_bfloat16, 256, 2, NF4) MAKE_kQuantizeBlockwise(__nv_bfloat16, 128, 2, NF4) MAKE_kQuantizeBlockwise(__nv_bfloat16, 64, 2, NF4) +template +__global__ void kQuantizeChannelwise(const float *code, + const T* A, + unsigned char* out, + float *absmax, + int n, + int cout) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + + int num = n / 2; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + int idx = 2*(i/cout)* cout + i%cout; + float local_absmax = absmax[i %cout]; + float inv_local_absmax = 1.0f/local_absmax; + unsigned char packed_4bit = 0; + switch(DATA_TYPE) + { + case FP4: + packed_4bit |= dQuantizeFP4(((float)A[idx])*inv_local_absmax) << 4; + packed_4bit |= dQuantizeFP4(((float)A[idx+cout])*inv_local_absmax); + out[i] = packed_4bit; + break; + case NF4: + packed_4bit |= dQuantizeNF4(((float)A[idx])*inv_local_absmax) << 4; + packed_4bit |= dQuantizeNF4(((float)A[idx+cout])*inv_local_absmax); + out[i] = packed_4bit; + break; + } + } +} -template void quantize_blockwise(const float *code, const paddle::Tensor& A, float *absmax, unsigned char *out, int blocksize, int n) +template void quantize_blockwise(const float *code, const paddle::Tensor& A, paddle::Tensor& absmax, unsigned char *out, int blocksize, int n, int channelwise) { typedef PDTraits traits_; typedef typename traits_::DataType DataType_; @@ -372,22 +403,43 @@ template void quantize_blockwise(const float num_blocks = n % blocksize == 0 ? 
num_blocks : num_blocks + 1; const DataType_* A_data = reinterpret_cast(A.data()); - if(blocksize == 4096) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 2048) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 1024) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 512) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 256) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 128) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else if(blocksize == 64) - kQuantizeBlockwise<<>>(code, A_data, absmax, out, n); - else - PD_THROW("only support blocksize is [64, 128, 256, 512, 1024, 2048, 4096]."); + if (channelwise == 0) { + if(blocksize == 4096) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 2048) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 1024) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 512) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 256) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 128) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + else if(blocksize == 64) + kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n); + } + else { + if (DATA_TYPE == General8bit) + PD_THROW("blocksize is -1 only support NF4 and FP4."); + + int cout = A.shape()[1]; + int max_threads = 1024; + + absmax = A.abs().max({0}); + + int64_t block_size = + std::min(static_cast(n), + static_cast(max_threads/ 4)); + + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (n + block_size - 1) / block_size); + + kQuantizeChannelwise<<>>( + code, A_data, out, absmax.data(), n, cout); + } CUDA_CHECK_RETURN(cudaPeekAtLastError()); @@ -395,38 +447,44 @@ template void quantize_blockwise(const float std::vector QuantizeBlockwise(const paddle::Tensor& input, const paddle::Tensor& code, int blocksize, std::string quant_type) { int n = input.numel(); + int channelwise = 0; std::vector out_shape = input.shape(); if (quant_type != "8bit") { // 4bit out_shape = {(n + 1) / 2, 1}; } + if (blocksize == -1){ + blocksize = input.shape()[0]; + out_shape = {input.shape()[0]/2, input.shape()[1]}; + channelwise = 1; + } auto out = paddle::empty(out_shape, paddle::DataType::UINT8, input.place()); int64_t absmax_shape = n / blocksize; auto absmax = paddle::empty({absmax_shape}, paddle::DataType::FLOAT32, input.place()); switch(input.type()) { case paddle::DataType::FLOAT32: if (quant_type == "8bit") - quantize_blockwise(code.data(), input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(code.data(), input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "nf4") { - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); } else if (quant_type == "fp4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); return {out, absmax}; case paddle::DataType::FLOAT16: if (quant_type == "8bit") - quantize_blockwise(code.data(), input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(code.data(), input, absmax, out.data(), blocksize, n, 
channelwise); else if (quant_type == "nf4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "fp4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); return {out, absmax}; case paddle::DataType::BFLOAT16: if (quant_type == "8bit") - quantize_blockwise(code.data(), input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(code.data(), input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "nf4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "fp4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); return {out, absmax}; default: @@ -440,7 +498,10 @@ std::vector QuantizeBlockwise(const paddle::Tensor& input, const std::vector> GetQuantizeBlockwiseInferShape(const std::vector& input_shape, const std::vector& code_shape, int blocksize, std::string quant_type){ int64_t first_shape = (input_shape[0] * input_shape[1] + 1) / 2; if (quant_type != "8bit") - return {{first_shape, 1}}; + if (blocksize != -1) + return {{first_shape, 1}}; + else + return {{input_shape[0]/2, input_shape[1]}}; else return {input_shape}; } From a6323eec219fcdc92bf424acd1927729a614ba4f Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Wed, 3 Jan 2024 10:51:20 +0000 Subject: [PATCH 2/4] =?UTF-8?q?=E7=9B=AE=E6=A0=87=E6=A3=80=E6=B5=8B?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E8=87=AA=E5=8A=A8=E5=8E=8B=E7=BC=A9=E7=A4=BA?= =?UTF-8?q?=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../detection/configs/picodet_reader.yml | 4 +-- .../detection/configs/ppyoloe_l_qat_dis.yaml | 5 ++- .../detection/paddle_inference_eval.py | 33 ++++++++++++++----- .../detection/post_process.py | 14 ++++---- 4 files changed, 34 insertions(+), 22 deletions(-) diff --git a/example/auto_compression/detection/configs/picodet_reader.yml b/example/auto_compression/detection/configs/picodet_reader.yml index 389673367..cd7ba8029 100644 --- a/example/auto_compression/detection/configs/picodet_reader.yml +++ b/example/auto_compression/detection/configs/picodet_reader.yml @@ -6,13 +6,13 @@ TrainDataset: !COCODataSet image_dir: train2017 anno_path: annotations/instances_train2017.json - dataset_dir: dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ EvalDataset: !COCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json - dataset_dir: dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ eval_height: &eval_height 416 eval_width: &eval_width 416 diff --git a/example/auto_compression/detection/configs/ppyoloe_l_qat_dis.yaml b/example/auto_compression/detection/configs/ppyoloe_l_qat_dis.yaml index 7102142ed..248a5b8e2 100644 --- a/example/auto_compression/detection/configs/ppyoloe_l_qat_dis.yaml +++ b/example/auto_compression/detection/configs/ppyoloe_l_qat_dis.yaml @@ -2,7 +2,7 @@ Global: reader_config: configs/yolo_reader.yml arch: PPYOLOE - include_nms: True + include_nms: False Evaluation: True model_dir: ./ppyoloe_crn_l_300e_coco model_filename: model.pdmodel @@ -30,5 +30,4 @@ TrainConfig: optimizer_builder: optimizer: 
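      # NOTE: with include_nms switched to False above, the exported PPYOLOE model
      # presumably no longer carries the NMS op, so boxes must come from the
      # evaluation script's own post-processing (see post_process.py and the
      # --include_nms flag in paddle_inference_eval.py in this series).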
type: SGD - weight_decay: 4.0e-05 - + weight_decay: 4.0e-05 \ No newline at end of file diff --git a/example/auto_compression/detection/paddle_inference_eval.py b/example/auto_compression/detection/paddle_inference_eval.py index d2e12afd1..b0368bffb 100644 --- a/example/auto_compression/detection/paddle_inference_eval.py +++ b/example/auto_compression/detection/paddle_inference_eval.py @@ -18,6 +18,7 @@ import sys import cv2 import numpy as np +from tqdm import tqdm import paddle from paddle.inference import Config @@ -82,9 +83,15 @@ def argsparser(): parser.add_argument("--img_shape", type=int, default=640, help="input_size") parser.add_argument( '--include_nms', - type=bool, - default=True, + type=str, + default='True', help="Whether include nms or not.") + parser.add_argument( + "--trt_calib_mode", + type=bool, + default=False, + help="If the model is produced by TRT offline quantitative " + "calibration, trt_calib_mode need to set True.") return parser @@ -208,8 +215,9 @@ def load_predictor( use_mkldnn=False, batch_size=1, device="CPU", - min_subgraph_size=3, + min_subgraph_size=4, use_dynamic_shape=False, + trt_calib_mode=False, trt_min_shape=1, trt_max_shape=1280, trt_opt_shape=640, @@ -238,9 +246,11 @@ def load_predictor( config = Config( os.path.join(model_dir, "model.pdmodel"), os.path.join(model_dir, "model.pdiparams")) + + config.enable_memory_optim() if device == "GPU": # initial GPU memory(M), device ID - config.enable_use_gpu(200, 0) + config.enable_use_gpu(1000, 0) # optimize graph and fuse op config.switch_ir_optim(True) else: @@ -260,12 +270,12 @@ def load_predictor( } if precision in precision_map.keys() and use_trt: config.enable_tensorrt_engine( - workspace_size=(1 << 25) * batch_size, + workspace_size=(1 << 30) * batch_size, max_batch_size=batch_size, min_subgraph_size=min_subgraph_size, precision_mode=precision_map[precision], use_static=True, - use_calib_mode=False, ) + use_calib_mode=False) if use_dynamic_shape: dynamic_shape_file = os.path.join(FLAGS.model_path, @@ -297,6 +307,7 @@ def predict_image(predictor, img, scale_factor = image_preprocess(image_file, image_shape) inputs = {} inputs["image"] = img + if FLAGS.include_nms: inputs['scale_factor'] = scale_factor input_names = predictor.get_input_names() @@ -356,7 +367,8 @@ def eval(predictor, val_loader, metric, rerun_flag=False): boxes_tensor = predictor.get_output_handle(output_names[0]) if FLAGS.include_nms: boxes_num = predictor.get_output_handle(output_names[1]) - for batch_id, data in enumerate(val_loader): + for batch_id, data in tqdm( + enumerate(val_loader), total=len(val_loader), desc='Evaluating'): data_all = {k: np.array(v) for k, v in data.items()} for i, _ in enumerate(input_names): input_tensor = predictor.get_input_handle(input_names[i]) @@ -382,7 +394,6 @@ def eval(predictor, val_loader, metric, rerun_flag=False): res = {'bbox': np_boxes, 'bbox_num': np_boxes_num} metric.update(data_all, res) if batch_id % 100 == 0: - print("Eval iter:", batch_id) sys.stdout.flush() metric.accumulate() metric.log() @@ -421,7 +432,6 @@ def main(): repeats=repeats) else: reader_cfg = load_config(FLAGS.reader_config) - dataset = reader_cfg["EvalDataset"] global val_loader val_loader = create("EvalReader")( @@ -432,6 +442,7 @@ def main(): anno_file = dataset.get_anno() metric = COCOMetric( anno_file=anno_file, clsid2catid=clsid2catid, IouType="bbox") + eval(predictor, val_loader, metric, rerun_flag=rerun_flag) if rerun_flag: @@ -444,6 +455,10 @@ def main(): paddle.enable_static() parser = argsparser() FLAGS = 
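    # NOTE: --include_nms is declared with type=str above because argparse's
    # type=bool treats any non-empty string (including "False") as True; the flag
    # is converted back to a real bool by the string comparison below.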
parser.parse_args() + if FLAGS.include_nms == 'True': + FLAGS.include_nms = True + else: + FLAGS.include_nms = False # DataLoader need run on cpu paddle.set_device("cpu") diff --git a/example/auto_compression/detection/post_process.py b/example/auto_compression/detection/post_process.py index eea2f0195..4ed79ce73 100644 --- a/example/auto_compression/detection/post_process.py +++ b/example/auto_compression/detection/post_process.py @@ -41,8 +41,7 @@ def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200): rest_boxes = boxes[indexes, :] iou = iou_of( rest_boxes, - np.expand_dims( - current_box, axis=0), ) + np.expand_dims(current_box, axis=0), ) indexes = indexes[iou <= iou_threshold] return box_scores[picked, :] @@ -122,7 +121,7 @@ def _non_max_suppression(self, prediction, scale_factor): picked_labels.extend([class_index] * box_probs.shape[0]) if len(picked_box_probs) == 0: - out_boxes_list.append(np.empty((0, 4))) + out_boxes_list.append(np.empty((0, 6))) else: picked_box_probs = np.concatenate(picked_box_probs) @@ -135,9 +134,8 @@ def _non_max_suppression(self, prediction, scale_factor): # clas score box out_box = np.concatenate( [ - np.expand_dims( - np.array(picked_labels), axis=-1), np.expand_dims( - picked_box_probs[:, 4], axis=-1), + np.expand_dims(np.array(picked_labels), axis=-1), + np.expand_dims(picked_box_probs[:, 4], axis=-1), picked_box_probs[:, :4] ], axis=1) @@ -152,6 +150,6 @@ def _non_max_suppression(self, prediction, scale_factor): return out_boxes_list, box_num_list def __call__(self, outs, scale_factor): - out_boxes_list, box_num_list = self._non_max_suppression(outs, - scale_factor) + out_boxes_list, box_num_list = self._non_max_suppression( + outs, scale_factor) return {'bbox': out_boxes_list, 'bbox_num': box_num_list} From 132c5b6f5f304b07c9f2fbdcbeebf1b8c3c19ff8 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 8 Jan 2024 09:30:42 +0000 Subject: [PATCH 3/4] =?UTF-8?q?paddle=5Finference.py=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../paddle_inference_eval.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/example/auto_compression/pytorch_yolo_series/paddle_inference_eval.py b/example/auto_compression/pytorch_yolo_series/paddle_inference_eval.py index a1df31b78..ea5cb975d 100644 --- a/example/auto_compression/pytorch_yolo_series/paddle_inference_eval.py +++ b/example/auto_compression/pytorch_yolo_series/paddle_inference_eval.py @@ -79,7 +79,8 @@ def argsparser(): "--device", type=str, default="GPU", - help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is GPU", + help= + "Choose the device you want to run, it can be: CPU/GPU/XPU, default is GPU", ) parser.add_argument( "--arch", type=str, default="YOLOv5", help="architectures name.") @@ -180,8 +181,9 @@ def draw_box(img, boxes, scores, cls_ids, conf=0.5, class_names=None): txt_size = cv2.getTextSize(text, font, 0.4, 1)[0] cv2.rectangle(img, (x0, y0), (x1, y1), color, 2) - cv2.rectangle(img, (x0, y0 + 1), ( - x0 + txt_size[0] + 1, y0 + int(1.5 * txt_size[1])), color, -1) + cv2.rectangle(img, (x0, y0 + 1), (x0 + txt_size[0] + 1, + y0 + int(1.5 * txt_size[1])), color, + -1) cv2.putText( img, text, (x0, y0 + txt_size[1]), @@ -288,8 +290,8 @@ def load_predictor( dynamic_shape_file = os.path.join(FLAGS.model_path, "dynamic_shape.txt") if os.path.exists(dynamic_shape_file): - config.enable_tuned_tensorrt_dynamic_shape(dynamic_shape_file, - True) + 
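                # NOTE: two-pass dynamic-shape workflow used by this script: the first
                # run calls collect_shape_range_info() to record shapes into
                # dynamic_shape.txt, and later runs load that file here via
                # enable_tuned_tensorrt_dynamic_shape().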
config.enable_tuned_tensorrt_dynamic_shape( + dynamic_shape_file, True) print("trt set dynamic shape done!") else: config.collect_shape_range_info(dynamic_shape_file) @@ -315,7 +317,8 @@ def eval(predictor, val_loader, anno_file, rerun_flag=False): input_names = predictor.get_input_names() output_names = predictor.get_output_names() boxes_tensor = predictor.get_output_handle(output_names[0]) - for batch_id, data in enumerate(val_loader): + for batch_id, data in tqdm( + enumerate(val_loader), total=len(val_loader), desc='Evaluating'): data_all = {k: np.array(v) for k, v in data.items()} inputs = {} if FLAGS.arch == "YOLOv6": @@ -345,7 +348,7 @@ def eval(predictor, val_loader, anno_file, rerun_flag=False): cpu_mems += cpu_mem gpu_mems += gpu_mem if batch_id % 100 == 0: - print("Eval iter:", batch_id) + # print("Eval iter:", batch_id) sys.stdout.flush() print("[Benchmark]Avg cpu_mem:{} MB, avg gpu_mem: {} MB".format( cpu_mems / sample_nums, gpu_mems / sample_nums)) From ba43c20918338f01305cd1aa75239fbea21bdcca Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 15 Jan 2024 09:26:11 +0000 Subject: [PATCH 4/4] aa --- .../detection/configs/ppyoloe_s_qat_dis.yaml | 54 +++++++++------ .../detection/configs/yolo_reader.yml | 4 +- .../nlp/configs/pp-minilm/auto/afqmc.yaml | 26 +++++-- .../nlp/configs/uie/uie_base.yaml | 37 +++++----- example/auto_compression/nlp/run.py | 2 + .../detection/configs/picodet_s_analysis.yaml | 16 ++--- .../configs/picodet_s_analyzed_ptq.yaml | 4 +- .../detection/configs/picodet_s_ptq.yaml | 8 +-- .../detection/configs/ppyoloe_s_ptq.yaml | 8 +-- .../detection/eval.py | 15 ++-- .../pytorch_yolo_series/README.md | 69 ++++++++++++++++++- setup.py | 23 ++++--- 12 files changed, 183 insertions(+), 83 deletions(-) diff --git a/example/auto_compression/detection/configs/ppyoloe_s_qat_dis.yaml b/example/auto_compression/detection/configs/ppyoloe_s_qat_dis.yaml index 3f6ade72b..60edb8db8 100644 --- a/example/auto_compression/detection/configs/ppyoloe_s_qat_dis.yaml +++ b/example/auto_compression/detection/configs/ppyoloe_s_qat_dis.yaml @@ -8,27 +8,39 @@ Global: model_filename: model.pdmodel params_filename: model.pdiparams -Distillation: - alpha: 1.0 - loss: soft_label +# Distillation: +# alpha: 1.0 +# loss: soft_label -QuantAware: - onnx_format: true - use_pact: true - activation_quantize_type: 'moving_average_abs_max' - quantize_op_types: - - conv2d - - depthwise_conv2d +# QuantAware: +# onnx_format: true +# use_pact: true +# activation_quantize_type: 'moving_average_abs_max' +# quantize_op_types: +# - conv2d +# - depthwise_conv2d -TrainConfig: - train_iter: 5000 - eval_iter: 1000 - learning_rate: - type: CosineAnnealingDecay - learning_rate: 0.00003 - T_max: 6000 - optimizer_builder: - optimizer: - type: SGD - weight_decay: 4.0e-05 +# TrainConfig: +# train_iter: 5000 +# eval_iter: 1000 +# learning_rate: +# type: CosineAnnealingDecay +# learning_rate: 0.00003 +# T_max: 6000 +# optimizer_builder: +# optimizer: +# type: SGD +# weight_decay: 4.0e-05 +QuantPost: + batch_size: 32 + batch_nums: None + algo: 'hist' + hist_percent: 0.999 + bias_correct: False + recon_level: None + regions: None + epochs: 20 + lr: 0.1 + simulate_activation_quant: False + skip_tensor_list: None diff --git a/example/auto_compression/detection/configs/yolo_reader.yml b/example/auto_compression/detection/configs/yolo_reader.yml index d10614530..6e013c1b9 100644 --- a/example/auto_compression/detection/configs/yolo_reader.yml +++ b/example/auto_compression/detection/configs/yolo_reader.yml @@ 
-6,13 +6,13 @@ TrainDataset: !COCODataSet image_dir: train2017 anno_path: annotations/instances_train2017.json - dataset_dir: dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ EvalDataset: !COCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json - dataset_dir: dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ worker_num: 0 diff --git a/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml b/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml index 9c9f58826..8244c90c3 100644 --- a/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml +++ b/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml @@ -6,11 +6,20 @@ Global: dataset: clue batch_size: 16 max_seq_length: 128 -TransformerPrune: - pruned_ratio: 0.25 -HyperParameterOptimization: + + +# 蒸馏 Distillation: -QuantPost: + teacher_model_dir: ./afqmc + teacher_model_filename: inference.pdmodel + teacher_params_filename: inference.pdiparams + +# 剪枝参数 +# 剪枝参数包括剪枝算法和裁剪度 +Prune: + prune_algo: transformer_pruner + pruned_ratio: 0.25 + TrainConfig: epochs: 6 eval_iter: 1070 @@ -20,3 +29,12 @@ TrainConfig: type: AdamW weight_decay: 0.01 origin_metric: 0.7403 + + +# 离线量化 +QuantPost: + activation_bits: 8 + quantize_op_types: + - conv2d + - depthwise_conv2d + weight_bits: 8 diff --git a/example/auto_compression/nlp/configs/uie/uie_base.yaml b/example/auto_compression/nlp/configs/uie/uie_base.yaml index 484f62899..36873084f 100644 --- a/example/auto_compression/nlp/configs/uie/uie_base.yaml +++ b/example/auto_compression/nlp/configs/uie/uie_base.yaml @@ -2,21 +2,24 @@ Global: model_dir: ./UIE model_filename: inference.pdmodel params_filename: inference.pdiparams - batch_size: 1 - max_seq_length: 512 - train_data: ./data/train.txt - dev_data: ./data/dev.txt -TrainConfig: - epochs: 200 - eval_iter: 100 - learning_rate: 1.0e-5 - optimizer_builder: - optimizer: - type: AdamW - weight_decay: 0.01 + task_name: afqmc + dataset: clue + batch_size: 16 + max_seq_length: 128 -QuantAware: - onnx_format: True -Distillation: - alpha: 1.0 - loss: l2 + +HyperParameterOptimization: + batch_num: + - 4 + - 16 + bias_correct: + - true + hist_percent: + - 0.999 + - 0.99999 + max_quant_count: 20 + ptq_algo: + - KL + - hist + weight_quantize_type: + - channel_wise_abs_max \ No newline at end of file diff --git a/example/auto_compression/nlp/run.py b/example/auto_compression/nlp/run.py index 1f6fa5403..5bfac56db 100644 --- a/example/auto_compression/nlp/run.py +++ b/example/auto_compression/nlp/run.py @@ -17,6 +17,8 @@ from paddlenlp.metrics import Mcc, PearsonAndSpearman from paddleslim.common import load_config from paddleslim.auto_compression.compressor import AutoCompression +import sys +sys.setrecursionlimit(1500) # 设置一个更高的限制,例如 1500 def argsparser(): diff --git a/example/post_training_quantization/detection/configs/picodet_s_analysis.yaml b/example/post_training_quantization/detection/configs/picodet_s_analysis.yaml index d3d6944c2..16a134c87 100644 --- a/example/post_training_quantization/detection/configs/picodet_s_analysis.yaml +++ b/example/post_training_quantization/detection/configs/picodet_s_analysis.yaml @@ -1,12 +1,12 @@ input_list: ['image', 'scale_factor'] -model_dir: ./picodet_s_416_coco_lcnet/ +model_dir: ./picodet_s_416_coco_lcnet model_filename: model.pdmodel params_filename: model.pdiparams save_dir: ./analysis_results metric: COCO num_classes: 80 plot_hist: True -get_target_quant_model: False +get_target_quant_model: None 
target_metric: None PTQ: @@ -22,15 +22,15 @@ EvalDataset: !COCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json - dataset_dir: /dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ # Small Dataset to accelerate analysis # If not exist, delete the dict of FastEvalDataset -FastEvalDataset: - !COCODataSet - image_dir: val2017 - anno_path: annotations/small_instances_val2017.json - dataset_dir: /dataset/coco/ +# FastEvalDataset: +# !COCODataSet +# image_dir: val2017 +# anno_path: annotations/small_instances_val2017.json +# dataset_dir: /dataset/coco/ eval_height: &eval_height 416 diff --git a/example/post_training_quantization/detection/configs/picodet_s_analyzed_ptq.yaml b/example/post_training_quantization/detection/configs/picodet_s_analyzed_ptq.yaml index 54aa3cb9c..6c3ea4721 100644 --- a/example/post_training_quantization/detection/configs/picodet_s_analyzed_ptq.yaml +++ b/example/post_training_quantization/detection/configs/picodet_s_analyzed_ptq.yaml @@ -12,13 +12,13 @@ TrainDataset: !COCODataSet image_dir: train2017 anno_path: annotations/instances_train2017.json - dataset_dir: /paddle/dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ EvalDataset: !COCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json - dataset_dir: /paddle/dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ eval_height: &eval_height 416 eval_width: &eval_width 416 diff --git a/example/post_training_quantization/detection/configs/picodet_s_ptq.yaml b/example/post_training_quantization/detection/configs/picodet_s_ptq.yaml index 005c0d46c..a1c5cb70a 100644 --- a/example/post_training_quantization/detection/configs/picodet_s_ptq.yaml +++ b/example/post_training_quantization/detection/configs/picodet_s_ptq.yaml @@ -1,5 +1,5 @@ input_list: ['image', 'scale_factor'] -model_dir: ./picodet_s_416_coco_lcnet/ +model_dir: ./picodet_s_analyzed_ptq_out model_filename: model.pdmodel params_filename: model.pdiparams skip_tensor_list: None @@ -12,13 +12,13 @@ TrainDataset: !COCODataSet image_dir: train2017 anno_path: annotations/instances_train2017.json - dataset_dir: /dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ EvalDataset: !COCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json - dataset_dir: /dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ eval_height: &eval_height 416 eval_width: &eval_width 416 @@ -34,5 +34,5 @@ EvalReader: - Resize: {interp: 2, target_size: *eval_size, keep_ratio: False} - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} - Permute: {} - batch_size: 32 + batch_size: 16 diff --git a/example/post_training_quantization/detection/configs/ppyoloe_s_ptq.yaml b/example/post_training_quantization/detection/configs/ppyoloe_s_ptq.yaml index 3c8752652..5fcf7212d 100644 --- a/example/post_training_quantization/detection/configs/ppyoloe_s_ptq.yaml +++ b/example/post_training_quantization/detection/configs/ppyoloe_s_ptq.yaml @@ -1,4 +1,4 @@ -input_list: ['image'] +input_list: ['image', 'scale_factor'] arch: PPYOLOE # When export exclude_nms=True, need set arch: PPYOLOE model_dir: ./ppyoloe_crn_s_300e_coco model_filename: model.pdmodel @@ -12,13 +12,13 @@ TrainDataset: !COCODataSet image_dir: train2017 anno_path: annotations/instances_train2017.json - dataset_dir: /dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ EvalDataset: 
!COCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json - dataset_dir: /dataset/coco/ + dataset_dir: /work/GETR-Lite-paddle-new/inference/datasets/coco/ worker_num: 0 @@ -29,4 +29,4 @@ EvalReader: - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2} - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} - Permute: {} - batch_size: 32 \ No newline at end of file + batch_size: 16 \ No newline at end of file diff --git a/example/post_training_quantization/detection/eval.py b/example/post_training_quantization/detection/eval.py index f8e1342d5..47fe16225 100644 --- a/example/post_training_quantization/detection/eval.py +++ b/example/post_training_quantization/detection/eval.py @@ -97,10 +97,11 @@ def eval(): if k in config['input_list'].keys(): data_input[config['input_list'][k]] = np.array(v) - outs = exe.run(val_program, - feed=data_input, - fetch_list=fetch_targets, - return_numpy=False) + outs = exe.run( + val_program, + feed=data_input, + fetch_list=fetch_targets, + return_numpy=False) res = {} if 'arch' in config and config['arch'] == 'keypoint': res = keypoint_post_process(data, data_input, exe, val_program, @@ -112,6 +113,7 @@ def eval(): else: for out in outs: v = np.array(out) + # print("v",v) if len(v.shape) > 1: res['bbox'] = v else: @@ -130,9 +132,8 @@ def main(): dataset = config['EvalDataset'] global val_loader - val_loader = create('EvalReader')(config['EvalDataset'], - config['worker_num'], - return_list=True) + val_loader = create('EvalReader')( + config['EvalDataset'], config['worker_num'], return_list=True) metric = None if config['metric'] == 'COCO': clsid2catid = {v: k for k, v in dataset.catid2clsid.items()} diff --git a/example/post_training_quantization/pytorch_yolo_series/README.md b/example/post_training_quantization/pytorch_yolo_series/README.md index 4bb4d304f..63a7d96c1 100755 --- a/example/post_training_quantization/pytorch_yolo_series/README.md +++ b/example/post_training_quantization/pytorch_yolo_series/README.md @@ -122,7 +122,7 @@ python eval.py --config_path=./configs/yolov5s_ptq.yaml #### 3.6 提高离线量化精度 ###### 3.6.1 量化分析工具 -本节介绍如何使用量化分析工具提升离线量化精度。离线量化功能仅需使用少量数据,且使用简单、能快速得到量化模型,但往往会造成较大的精度损失。PaddleSlim提供量化分析工具,会使用接口```paddleslim.quant.AnalysisPTQ```,可视化展示出不适合量化的层,通过跳过这些层,提高离线量化模型精度。```paddleslim.quant.AnalysisPTQ```详解见[AnalysisPTQ.md](../../../docs/zh_cn/tutorials/quant/AnalysisPTQ.md)。 +本节介绍如何使用量化分析工具提升离线量化精度。离线量化功能仅需使用少量数据,且使用简单、能快速得到量化模型,但往往会造成较大的精度损失。PaddleSlim提供量化分析工具,会使用接口```paddleslim.quant.AnalysisPTQ```,可视化展示出不适合量化的层,通过跳过这些层,提高离线量化模型精度。```paddleslim.quant.AnalysisPTQ```详解见[AnalysisPTQ.md](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/zh_cn/tutorials/quant/post_training_quantization.md)。 由于YOLOv6离线量化效果较差,以YOLOv6为例,量化分析工具具体使用方法如下: @@ -207,7 +207,70 @@ python fine_tune.py --config_path=./configs/yolov6s_fine_tune.yaml --simulate_ac ## 4.预测部署 预测部署可参考[YOLO系列模型自动压缩示例](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/example/auto_compression/pytorch_yolo_series) - - +量化模型在GPU上可以使用TensorRT进行加速,在CPU上可以使用MKLDNN进行加速。 +| 参数名 | 含义 | +| model_path | inference模型文件所在路径,该目录下需要有文件model.pdmodel和params.pdiparams两个文件 | +| dataset_dir | 指定COCO数据集的目录,这是存储数据集的根目录 | +| image_file | 如果只测试单张图片效果,直接根据image_file指定图片路径 | +| val_image_dir | COCO数据集中验证图像的目录名,默认为val2017 | +| val_anno_path | 指定COCO数据集的注释(annotation)文件路径,这是包含验证集标注信息的JSON文件,默认为annotations/instances_val2017.json | +| benchmark | 指定是否运行性能基准测试。如果设置为True,程序将会进行性能测试 | +| device | 使用GPU或者CPU预测,可选CPU/GPU/XPU,默认设置为GPU | +| use_trt | 
是否使用TensorRT进行预测| +| use_mkldnn | 是否使用MKL-DNN加速库,注意use_mkldnn与use_gpu同时为True时,将忽略enable_mkldnn,而使用GPU预测| +| use_dynamic_shape | 是否使用动态形状(dynamic_shape)功能 | +| precision | fp32/fp16/int8| +| arch | 指定所使用的模型架构的名称,例如YOLOv5 | +| img_shape | 指定模型输入的图像尺寸 | +| batch_size | 指定模型输入的批处理大小 | +| use_mkldnn | 指定是否使用MKLDNN加速(主要针对CPU)| +| cpu_threads | 指定在CPU上使用的线程数 | + +首先,我们拥有的yolov6.onnx,我们需要把ONNX模型转成paddle模型,具体参考使用[X2Paddle迁移推理模型](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/model_convert/convert_with_x2paddle_cn.html#x2paddle) +- 安装X2Paddle +方式一:pip 安装 +```shell +pip install X2Paddle==1.3.9 +``` +方式二:源码安装 +```shell +git clone https://github.com/PaddlePaddle/X2Paddle.git +cd X2Paddle +python setup.py install +``` +使用命令将YOLOv6.onnx模型转换成paddle模型 +```shell +x2paddle --framework=onnx --model=yolov6s.onnx --save_dir=yolov6_model +``` +- TensorRT Python部署 +使用[paddle_inference_eval.py](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/example/auto_compression/pytorch_yolo_series/paddle_inference_eval.py)部署 +```shell +python paddle_inference_eval.py --model_path=yolov6_model/inference_model --dataset_dir=datasets/coco --use_trt=True --precision=fp32 --arch=YOLOv6 +``` +执行int8量化 +```shell +python paddle_inference_eval.py --model_path=yolov6s_ptq_out --dataset_dir==datasets/coco --use_trt=True --precision=int8 --arch=YOLOv6 +``` +- C++部署 +具体可参考[运行PP-YOLOE-l目标检测模型样例](https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/c%2B%2B/gpu/ppyoloe_crn_l) +将compile.sh中DEMO_NAME修改为yolov6_test,并且将ppyoloe_crn_l.cc修改为yolov6_test.cc,根据环境修改相关配置库 +运行bash compile.sh编译样例。 +- 运行样例 +-使用原生GPU运行样例(将ONNX模型转成的paddle模型复制到Paddle-Inference-demo/c++/gpu/ppyoloe_crn_l/目录下) +```shell +./build/yolov6_test --model_file yolov6s_infer/model.pdmodel --params_file yolov6s_infer/model.pdiparams +``` +- 使用TensorRT FP32运行样例 +```shell +./build/yolov6_test --model_file yolov6s_infer/model.pdmodel --params_file yolov6s_infer/model.pdiparams --run_mode=trt_fp32 +``` +- 使用TensorRT FP16运行样例 +```shell +./build/yolov6_test --model_file yolov6s_infer/model.pdmodel --params_file yolov6s_infer/model.pdiparams --run_mode=trt_fp16 +``` +- 使用TensorRT INT8运行样例 +```shell +./build/yolov6_test --model_file yolov6s_infer/model.pdmodel --params_file yolov6s_infer/model.pdiparams --run_mode=trt_int8 +``` ## 5.FAQ - 如果想对模型进行自动压缩,可进入[YOLO系列模型自动压缩示例](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/example/auto_compression/pytorch_yolo_series)中进行实验。 diff --git a/setup.py b/setup.py index bc2842802..a2f84f961 100644 --- a/setup.py +++ b/setup.py @@ -22,17 +22,18 @@ from setuptools import find_packages from setuptools import setup -if 'develop' in subprocess.getoutput('git branch'): - slim_version = '0.0.0_dev' -else: - tag_list = subprocess.getoutput('git tag').split('\n') - if 'rc' in tag_list[-1]: - if tag_list[-1].split('rc')[0] in tag_list[-2]: - slim_version = tag_list[-2] - else: - slim_version = tag_list[-1] - else: - slim_version = tag_list[-1] +# if 'develop' in subprocess.getoutput('git branch'): +# slim_version = '0.0.0_dev' +# else: +# tag_list = subprocess.getoutput('git tag').split('\n') +# if 'rc' in tag_list[-1]: +# if tag_list[-1].split('rc')[0] in tag_list[-2]: +# slim_version = tag_list[-2] +# else: +# slim_version = tag_list[-1] +# else: +# slim_version = tag_list[-1] +slim_version = '2.6.0' with open('./requirements.txt') as f: setup_requires = f.read().splitlines()
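For reference, a minimal NumPy sketch of the channel-wise 4-bit layout introduced in PATCH 1 (blocksize = -1): the per-channel scale is the column-wise max of |A|, rows 2k and 2k+1 of each column share one packed byte (high/low nibble), and the packed output has shape [rows/2, cols]. The 4-bit codec below is a hypothetical uniform stand-in (the real NF4/FP4 codebooks dQuantizeNF4 / dDequantizeNF4 are not reproduced), so only the packing, indexing, and scaling mirror the kernels.

```python
import numpy as np

def _encode4(x):
    # Hypothetical stand-in for dQuantizeNF4: map [-1, 1] onto a 4-bit code.
    return np.clip(np.round((x + 1.0) * 7.5), 0, 15).astype(np.uint8)

def _decode4(c):
    # Inverse of the stand-in codec above.
    return c.astype(np.float32) / 7.5 - 1.0

def quantize_channelwise(A):
    # A: [rows, cols], rows even; one scale per column, as in A.abs().max({0}).
    absmax = np.abs(A).max(axis=0)                  # shape [cols]
    scaled = A / absmax                             # per-column normalization
    hi = _encode4(scaled[0::2, :])                  # rows 2k   -> high nibble
    lo = _encode4(scaled[1::2, :])                  # rows 2k+1 -> low nibble
    return ((hi << 4) | lo).astype(np.uint8), absmax  # packed: [rows//2, cols]

def dequantize_channelwise(packed, absmax):
    out = np.empty((packed.shape[0] * 2, packed.shape[1]), dtype=np.float32)
    out[0::2, :] = _decode4(packed >> 4) * absmax    # row 2k   from high nibble
    out[1::2, :] = _decode4(packed & 0x0F) * absmax  # row 2k+1 from low nibble
    return out

A = np.random.randn(8, 4).astype(np.float32)
packed, absmax = quantize_channelwise(A)
recon = dequantize_channelwise(packed, absmax)
assert packed.shape == (4, 4) and recon.shape == A.shape
```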