diff --git a/example/auto_compression/nlp/README.md b/example/auto_compression/nlp/README.md index c98f1987e..da35eebcc 100644 --- a/example/auto_compression/nlp/README.md +++ b/example/auto_compression/nlp/README.md @@ -56,16 +56,16 @@ #### 3.1 准备环境 - python >= 3.6 -- PaddlePaddle >= 2.4 (可从[Paddle官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html)下载安装) -- PaddleSlim >= 2.4 -- PaddleNLP >= 2.3 +- PaddlePaddle ==2.5 (可从[Paddle官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html)下载安装) +- PaddleSlim ==2.5 +- PaddleNLP ==2.6 安装paddlepaddle: ```shell # CPU -pip install paddlepaddle==2.4.1 +pip install paddlepaddle==2.5.0 # GPU 以Ubuntu、CUDA 11.2为例 -python -m pip install paddlepaddle-gpu==2.4.1.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html +python -m pip install paddlepaddle-gpu==2.5.0.post116 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html ``` 安装paddleslim: @@ -95,7 +95,6 @@ pip install paddlenlp |:------:|:------:|:------:|:------:|:------:|:-----------:|:------:|:------:| | PP-MiniLM | [afqmc](https://bj.bcebos.com/v1/paddle-slim-models/act/afqmc.tar) | [tnews](https://bj.bcebos.com/v1/paddle-slim-models/act/tnews.tar) | [iflytek](https://bj.bcebos.com/v1/paddle-slim-models/act/iflytek.tar) | [cmnli](https://bj.bcebos.com/v1/paddle-slim-models/act/cmnli.tar) | [ ocnli](https://bj.bcebos.com/v1/paddle-slim-models/act/ocnli.tar) | [cluewsc2020](https://bj.bcebos.com/v1/paddle-slim-models/act/cluewsc.tar) | [csl](https://bj.bcebos.com/v1/paddle-slim-models/act/csl.tar) | | ERNIE 3.0-Medium | [afqmc](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/AFQMC.tar) | [tnews](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/TNEWS.tar) | [iflytek](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/IFLYTEK.tar) | 
[cmnli](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/CMNLI.tar) | [ocnli](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/OCNLI.tar) | [cluewsc2020](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/CLUEWSC2020.tar) | [csl](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/CSL.tar) | -| UIE-base | [报销工单](https://bj.bcebos.com/v1/paddle-slim-models/act/uie_base.tar) | 从上表获得模型超链接, 并用以下命令下载推理模型文件: @@ -119,11 +118,6 @@ export CUDA_VISIBLE_DEVICES=0 python run.py --config_path='./configs/pp-minilm/auto/afqmc.yaml' --save_dir='./save_afqmc_pruned/' ``` -自动压缩UIE系列模型需要使用 run_uie.py 脚本启动,会使用接口```paddleslim.auto_compression.AutoCompression```对模型进行自动压缩。配置config文件中训练部分的参数,将任务名称、模型类型、数据集名称、压缩参数传入,配置完成后便可对模型进行蒸馏量化训练。 -```shell -export CUDA_VISIBLE_DEVICES=0 -python run_uie.py --config_path='./configs/uie/uie_base.yaml' --save_dir='./save_uie_qat/' -``` 如仅需验证模型精度,或验证压缩之后模型精度,在启动```run.py```脚本时,将配置文件中模型文件夹 ```model_dir``` 改为压缩之后保存的文件夹路径 ```./save_afqmc_pruned``` ,命令加上```--eval True```即可: ```shell @@ -217,8 +211,6 @@ QuantPost: - TensorRT预测: -环境配置:如果使用 TesorRT 预测引擎,需安装 ```WITH_TRT=ON``` 的Paddle,下载地址:[Python预测库](https://paddleinference.paddlepaddle.org.cn/master/user_guides/download_lib.html#python) - 首先下载量化好的模型: ```shell wget https://bj.bcebos.com/v1/paddle-slim-models/act/save_ppminilm_afqmc_new_calib.tar @@ -227,10 +219,30 @@ tar -xf save_ppminilm_afqmc_new_calib.tar ```shell python paddle_inference_eval.py \ - --model_path=save_ernie3_afqmc_new_cablib \ + --model_path=save_ppminilm_afqmc_new_calib \ + --model_filename=inference.pdmodel \ + --params_filename=inference.pdiparams \ + --task_name='afqmc' \ + --use_trt \ + --precision=int8 +``` + +- ERNIE 3.0-Medium: +```shell +python paddle_inference_eval.py \ + --model_path=TNEWS \ --model_filename=infer.pdmodel \ --params_filename=infer.pdiparams \ - --task_name='afqmc' \ + --task_name='tnews' \ + 
--use_trt \ + --precision=fp32 +``` +```shell +python paddle_inference_eval.py \ + --model_path=save_tnews_pruned \ + --model_filename=infer.pdmodel \ + --params_filename=infer.pdiparams \ + --task_name='tnews' \ --use_trt \ --precision=int8 ``` @@ -239,9 +251,9 @@ python paddle_inference_eval.py \ ```shell python paddle_inference_eval.py \ - --model_path=save_ernie3_afqmc_new_cablib \ - --model_filename=infer.pdmodel \ - --params_filename=infer.pdiparams \ + --model_path=save_ppminilm_afqmc_new_calib \ + --model_filename=inference.pdmodel \ + --params_filename=inference.pdiparams \ --task_name='afqmc' \ --device=cpu \ --use_mkldnn=True \ diff --git a/example/auto_compression/nlp/configs/ernie3.0/tnews.yaml b/example/auto_compression/nlp/configs/ernie3.0/tnews.yaml index 49093ab87..b90da628a 100644 --- a/example/auto_compression/nlp/configs/ernie3.0/tnews.yaml +++ b/example/auto_compression/nlp/configs/ernie3.0/tnews.yaml @@ -6,12 +6,17 @@ Global: dataset: clue batch_size: 16 max_seq_length: 128 -TrainConfig: - epochs: 6 - eval_iter: 1110 - learning_rate: 2.0e-5 - optimizer_builder: - optimizer: - type: AdamW - weight_decay: 0.01 - origin_metric: 0.5700 + +# 剪枝 +Prune: + prune_algo: transformer_pruner + pruned_ratio: 0.25 + +# 离线量化 +QuantPost: + activation_bits: 8 + quantize_op_types: + - depthwise_conv2d + - conv2d + weight_bits: 8 + \ No newline at end of file diff --git a/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml b/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml index 9c9f58826..fdf65673b 100644 --- a/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml +++ b/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml @@ -6,17 +6,11 @@ Global: dataset: clue batch_size: 16 max_seq_length: 128 -TransformerPrune: - pruned_ratio: 0.25 -HyperParameterOptimization: -Distillation: + +#离线量化 QuantPost: -TrainConfig: - epochs: 6 - eval_iter: 1070 - learning_rate: 2.0e-5 - optimizer_builder: - optimizer: - type: AdamW - 
weight_decay: 0.01 - origin_metric: 0.7403 + activation_bits: 8 + quantize_op_types: + - conv2d + - depthwise_conv2d + weight_bits: 8 \ No newline at end of file diff --git a/example/auto_compression/nlp/paddle_inference_eval.py b/example/auto_compression/nlp/paddle_inference_eval.py index f48e20698..073f032e5 100644 --- a/example/auto_compression/nlp/paddle_inference_eval.py +++ b/example/auto_compression/nlp/paddle_inference_eval.py @@ -91,7 +91,8 @@ def parse_args(): "--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " + help= + "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument( "--perf_warmup_steps", @@ -107,7 +108,8 @@ def parse_args(): type=str, default="fp32", choices=["fp32", "fp16", "int8"], - help="The precision of inference. It can be 'fp32', 'fp16' or 'int8'. Default is 'fp16'.", + help= + "The precision of inference. It can be 'fp32', 'fp16' or 'int8'. 
Default is 'fp32'.", ) parser.add_argument( "--use_mkldnn", @@ -156,8 +158,7 @@ def _convert_example(example, } elif "target" in example: # wsc text, query, pronoun, query_idx, pronoun_idx = ( - example["text"], - example["target"]["span1_text"], + example["text"], example["target"]["span1_text"], example["target"]["span2_text"], example["target"]["span1_index"], example["target"]["span2_index"], ) @@ -209,6 +210,12 @@ def create_predictor(cls, args): config = paddle.inference.Config( os.path.join(args.model_path, args.model_filename), os.path.join(args.model_path, args.params_filename)) + config.switch_ir_debug(True) + # 适用于ERNIE 3.0-Medium模型 + # config.exp_disable_tensorrt_ops(["elementwise_add"]) + # config.exp_disable_tensorrt_ops(["fused_embedding_eltwise_layernorm"]) + # config.exp_disable_tensorrt_ops(["tmp_3"]) + if args.device == "gpu": # set GPU configs accordingly config.enable_use_gpu(100, 0) @@ -239,8 +246,8 @@ def create_predictor(cls, args): dynamic_shape_file = os.path.join(args.model_path, "dynamic_shape.txt") if os.path.exists(dynamic_shape_file): - config.enable_tuned_tensorrt_dynamic_shape(dynamic_shape_file, - True) + config.enable_tuned_tensorrt_dynamic_shape( + dynamic_shape_file, True) print("trt set dynamic shape done!") else: config.collect_shape_range_info(dynamic_shape_file) @@ -365,4 +372,4 @@ def main(): if __name__ == "__main__": paddle.set_device("cpu") - main() + main() \ No newline at end of file diff --git a/paddleslim/quant/advanced/gptq.py b/paddleslim/quant/advanced/gptq.py index 96566858f..f5b73971f 100644 --- a/paddleslim/quant/advanced/gptq.py +++ b/paddleslim/quant/advanced/gptq.py @@ -182,4 +182,4 @@ def fasterquant(self, self.quantized = True del H, Q, Hinv, W, Losses - paddle.device.cuda.empty_cache() + paddle.device.cuda.empty_cache() \ No newline at end of file