From 0bf10c913cf02fe5d7d87cb00a52b280436873d6 Mon Sep 17 00:00:00 2001
From: whs
Date: Wed, 14 Jun 2023 10:11:46 +0800
Subject: [PATCH 1/3] Change in_dygraph_mode to in_dynamic_mode (#1764)

---
 paddleslim/nas/ofa/layers.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/paddleslim/nas/ofa/layers.py b/paddleslim/nas/ofa/layers.py
index f8edd9aab..1477b8070 100644
--- a/paddleslim/nas/ofa/layers.py
+++ b/paddleslim/nas/ofa/layers.py
@@ -20,7 +20,6 @@
 from ...common import get_logger
 from .utils.utils import compute_start_end, get_same_padding, convert_to_list
 from .layers_base import *
-from paddle.framework import in_dygraph_mode

 __all__ = [
     'SuperConv2D', 'SuperConv2DTranspose', 'SuperSeparableConv2D',
@@ -985,7 +984,7 @@ def forward(self, input):
                 "use_global_stats", self._use_global_stats,
                 "trainable_statistics", trainable_statistics)

-            if in_dygraph_mode():
+            if paddle.in_dynamic_mode():
                 paddle_compile = os.environ.get("paddle_compile")
                 if feature_dim != self._mean.shape[0]:
                     if not paddle_compile or "Develop" in paddle_compile:

From a5ba49f5f31eff2f1babf7e28ee15f11cfd1d647 Mon Sep 17 00:00:00 2001
From: lizexu123 <39205361+lizexu123@users.noreply.github.com>
Date: Tue, 30 Jan 2024 10:55:20 +0800
Subject: [PATCH 2/3] Fix the NLP model auto-compression example
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 example/auto_compression/nlp/README.md        |  48 +++--
 .../nlp/configs/ernie3.0/tnews.yaml           |  23 ++-
 .../nlp/configs/pp-minilm/auto/afqmc.yaml     |  20 +-
 .../nlp/paddle_inference_eval.py              |  21 ++-
 paddleslim/quant/advanced/auto_clip.py        | 172 ++++++++++++++++++
 paddleslim/quant/advanced/gptq.py             |   2 +-
 6 files changed, 238 insertions(+), 48 deletions(-)
 create mode 100644 paddleslim/quant/advanced/auto_clip.py

diff --git a/example/auto_compression/nlp/README.md b/example/auto_compression/nlp/README.md
index c98f1987e..da35eebcc 100644
--- a/example/auto_compression/nlp/README.md
+++ b/example/auto_compression/nlp/README.md
@@ -56,16 +56,16 @@
 #### 3.1 Environment setup
 - python >= 3.6
-- PaddlePaddle >= 2.4 (install from the [Paddle website](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html))
-- PaddleSlim >= 2.4
-- PaddleNLP >= 2.3
+- PaddlePaddle ==2.5 (install from the [Paddle website](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html))
+- PaddleSlim ==2.5
+- PaddleNLP ==2.6

 Install paddlepaddle:
 ```shell
 # CPU
-pip install paddlepaddle==2.4.1
+pip install paddlepaddle==2.5.0
-# GPU: Ubuntu with CUDA 11.2, for example
-python -m pip install paddlepaddle-gpu==2.4.1.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
+# GPU: Ubuntu with CUDA 11.6, for example
+python -m pip install paddlepaddle-gpu==2.5.0.post116 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
 ```

 Install paddleslim:
@@ -95,7 +95,6 @@
 pip install paddlenlp
 | Model | AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL |
 |:------:|:------:|:------:|:------:|:------:|:-----------:|:------:|:------:|
 | PP-MiniLM | [afqmc](https://bj.bcebos.com/v1/paddle-slim-models/act/afqmc.tar) | [tnews](https://bj.bcebos.com/v1/paddle-slim-models/act/tnews.tar) | [iflytek](https://bj.bcebos.com/v1/paddle-slim-models/act/iflytek.tar) | [cmnli](https://bj.bcebos.com/v1/paddle-slim-models/act/cmnli.tar) | [ocnli](https://bj.bcebos.com/v1/paddle-slim-models/act/ocnli.tar) | [cluewsc2020](https://bj.bcebos.com/v1/paddle-slim-models/act/cluewsc.tar) | [csl](https://bj.bcebos.com/v1/paddle-slim-models/act/csl.tar) |
 | ERNIE 3.0-Medium | [afqmc](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/AFQMC.tar) | [tnews](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/TNEWS.tar) | [iflytek](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/IFLYTEK.tar) | [cmnli](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/CMNLI.tar) | [ocnli](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/OCNLI.tar) | [cluewsc2020](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/CLUEWSC2020.tar) | [csl](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/CSL.tar) |
-| UIE-base | [expense-reimbursement work orders](https://bj.bcebos.com/v1/paddle-slim-models/act/uie_base.tar) |

 Get the model from the links in the table above and download the inference model files with the following commands:
@@ -119,11 +118,6 @@
 export CUDA_VISIBLE_DEVICES=0
 python run.py --config_path='./configs/pp-minilm/auto/afqmc.yaml' --save_dir='./save_afqmc_pruned/'
 ```
-Auto-compression of the UIE family of models is launched with the run_uie.py script, which uses the ```paddleslim.auto_compression.AutoCompression``` interface to compress the model automatically. Configure the training parameters in the config file, passing in the task name, model type, dataset name, and compression parameters; once configured, the model can be trained with distillation and quantization.
-```shell
-export CUDA_VISIBLE_DEVICES=0
-python run_uie.py --config_path='./configs/uie/uie_base.yaml' --save_dir='./save_uie_qat/'
-```

 To only validate model accuracy (for example, after compression), change the model folder ```model_dir``` in the config file to the directory saved after compression, e.g. ```./save_afqmc_pruned```, and launch ```run.py``` with ```--eval True``` added to the command:
 ```shell
@@ -217,8 +211,6 @@ QuantPost:

 - TensorRT inference:

-Environment setup: if the TensorRT inference engine is used, install a Paddle build compiled with ```WITH_TRT=ON```; download it from the [Python inference library](https://paddleinference.paddlepaddle.org.cn/master/user_guides/download_lib.html#python).
-
 First download the quantized model:
 ```shell
 wget https://bj.bcebos.com/v1/paddle-slim-models/act/save_ppminilm_afqmc_new_calib.tar
 tar -xf save_ppminilm_afqmc_new_calib.tar
 ```

 ```shell
 python paddle_inference_eval.py \
-    --model_path=save_ernie3_afqmc_new_cablib \
+    --model_path=save_ppminilm_afqmc_new_calib \
+    --model_filename=inference.pdmodel \
+    --params_filename=inference.pdiparams \
+    --task_name='afqmc' \
+    --use_trt \
+    --precision=int8
+```
+
+- ERNIE 3.0-Medium:
+```shell
+python paddle_inference_eval.py \
+    --model_path=TNEWS \
     --model_filename=infer.pdmodel \
     --params_filename=infer.pdiparams \
-    --task_name='afqmc' \
+    --task_name='tnews' \
+    --use_trt \
+    --precision=fp32
+```
+```shell
+python paddle_inference_eval.py \
+    --model_path=save_tnews_pruned \
+    --model_filename=infer.pdmodel \
+    --params_filename=infer.pdiparams \
+    --task_name='tnews' \
     --use_trt \
     --precision=int8
 ```

 ```shell
 python paddle_inference_eval.py \
-    --model_path=save_ernie3_afqmc_new_cablib \
-    --model_filename=infer.pdmodel \
-    --params_filename=infer.pdiparams \
+    --model_path=save_ppminilm_afqmc_new_calib \
+    --model_filename=inference.pdmodel \
+    --params_filename=inference.pdiparams \
     --task_name='afqmc' \
     --device=cpu \
     --use_mkldnn=True \
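For readers following the deployment commands above: below is a minimal sketch of the TensorRT int8 predictor setup those commands exercise. It is an illustration, not the script itself; the model path, batch/sequence shapes, and input layout are assumptions, and the real `paddle_inference_eval.py` (see its diff later in this patch) additionally handles dynamic-shape tuning and task-specific tokenization.

```python
# Hedged sketch of the TensorRT int8 inference flow behind the README
# commands above. Paths and input shapes are illustrative assumptions.
import numpy as np
from paddle.inference import Config, PrecisionType, create_predictor

config = Config("save_ppminilm_afqmc_new_calib/inference.pdmodel",
                "save_ppminilm_afqmc_new_calib/inference.pdiparams")
config.enable_use_gpu(100, 0)  # 100 MB initial memory pool on GPU 0
config.enable_tensorrt_engine(
    workspace_size=1 << 30,
    max_batch_size=32,
    min_subgraph_size=5,
    precision_mode=PrecisionType.Int8,
    use_static=False,
    use_calib_mode=False)
# A first run can record tensor shape ranges; later runs would load the tuned
# file via config.enable_tuned_tensorrt_dynamic_shape("dynamic_shape.txt", True).
config.collect_shape_range_info("dynamic_shape.txt")
predictor = create_predictor(config)

# Feed a dummy batch of token ids, assumed [batch, seq_len] int64 inputs.
for name in predictor.get_input_names():
    predictor.get_input_handle(name).copy_from_cpu(
        np.ones((1, 128), dtype="int64"))
predictor.run()
logits = predictor.get_output_handle(
    predictor.get_output_names()[0]).copy_to_cpu()
print(logits.shape)
```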
diff --git a/example/auto_compression/nlp/configs/ernie3.0/tnews.yaml b/example/auto_compression/nlp/configs/ernie3.0/tnews.yaml
index 49093ab87..b90da628a 100644
--- a/example/auto_compression/nlp/configs/ernie3.0/tnews.yaml
+++ b/example/auto_compression/nlp/configs/ernie3.0/tnews.yaml
@@ -6,12 +6,17 @@ Global:
   dataset: clue
   batch_size: 16
   max_seq_length: 128
-TrainConfig:
-  epochs: 6
-  eval_iter: 1110
-  learning_rate: 2.0e-5
-  optimizer_builder:
-    optimizer:
-      type: AdamW
-    weight_decay: 0.01
-  origin_metric: 0.5700
+
+# Pruning
+Prune:
+  prune_algo: transformer_pruner
+  pruned_ratio: 0.25
+
+# Post-training quantization
+QuantPost:
+  activation_bits: 8
+  quantize_op_types:
+  - depthwise_conv2d
+  - conv2d
+  weight_bits: 8
\ No newline at end of file
diff --git a/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml b/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml
index 9c9f58826..fdf65673b 100644
--- a/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml
+++ b/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml
@@ -6,17 +6,11 @@ Global:
   dataset: clue
   batch_size: 16
   max_seq_length: 128
-TransformerPrune:
-  pruned_ratio: 0.25
-HyperParameterOptimization:
-Distillation:
+
+# Post-training quantization
 QuantPost:
-TrainConfig:
-  epochs: 6
-  eval_iter: 1070
-  learning_rate: 2.0e-5
-  optimizer_builder:
-    optimizer:
-      type: AdamW
-    weight_decay: 0.01
-  origin_metric: 0.7403
+  activation_bits: 8
+  quantize_op_types:
+  - conv2d
+  - depthwise_conv2d
+  weight_bits: 8
\ No newline at end of file
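The two configs above drop the training-based strategies (TransformerPrune, Distillation, TrainConfig) in favor of pure post-training quantization. For orientation, here is a hedged sketch of the `run.py`-style launcher that consumes such a yaml through `paddleslim.auto_compression.AutoCompression`. The model paths, feed names, and dummy dataloader are placeholders, and constructor details may vary across PaddleSlim versions; the real example script builds its dataloader from the CLUE dataset.

```python
# Hedged sketch of a run.py-style launcher feeding one of the yaml configs
# above to AutoCompression. Paths and feed names are assumptions.
import numpy as np
import paddle
from paddleslim.common import load_config
from paddleslim.auto_compression import AutoCompression

paddle.enable_static()

def train_dataloader():
    # Placeholder calibration batches; feed names assumed to match the
    # exported model's inputs.
    for _ in range(8):
        yield {"input_ids": np.zeros((16, 128), dtype="int64"),
               "token_type_ids": np.zeros((16, 128), dtype="int64")}

ac = AutoCompression(
    model_dir="./afqmc",                  # downloaded inference model
    model_filename="inference.pdmodel",
    params_filename="inference.pdiparams",
    save_dir="./save_afqmc_quant",
    config=load_config("./configs/pp-minilm/auto/afqmc.yaml"),
    train_dataloader=train_dataloader())
ac.compress()
```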
diff --git a/example/auto_compression/nlp/paddle_inference_eval.py b/example/auto_compression/nlp/paddle_inference_eval.py
index f48e20698..073f032e5 100644
--- a/example/auto_compression/nlp/paddle_inference_eval.py
+++ b/example/auto_compression/nlp/paddle_inference_eval.py
@@ -91,7 +91,8 @@ def parse_args():
         "--max_seq_length",
         default=128,
         type=int,
-        help="The maximum total input sequence length after tokenization. Sequences longer "
+        help=
+        "The maximum total input sequence length after tokenization. Sequences longer "
         "than this will be truncated, sequences shorter will be padded.", )
     parser.add_argument(
         "--perf_warmup_steps",
@@ -107,7 +108,8 @@ def parse_args():
         type=str,
         default="fp32",
         choices=["fp32", "fp16", "int8"],
-        help="The precision of inference. It can be 'fp32', 'fp16' or 'int8'. Default is 'fp16'.",
+        help=
+        "The precision of inference. It can be 'fp32', 'fp16' or 'int8'. Default is 'fp32'.", )
     parser.add_argument(
         "--use_mkldnn",
@@ -156,8 +158,7 @@ def _convert_example(example,
         }
     elif "target" in example:  # wsc
         text, query, pronoun, query_idx, pronoun_idx = (
-            example["text"],
-            example["target"]["span1_text"],
+            example["text"], example["target"]["span1_text"],
             example["target"]["span2_text"],
             example["target"]["span1_index"],
             example["target"]["span2_index"], )
@@ -209,6 +210,12 @@ def create_predictor(cls, args):
         config = paddle.inference.Config(
             os.path.join(args.model_path, args.model_filename),
             os.path.join(args.model_path, args.params_filename))
+        config.switch_ir_debug(True)
+        # For the ERNIE 3.0-Medium model
+        # config.exp_disable_tensorrt_ops(["elementwise_add"])
+        # config.exp_disable_tensorrt_ops(["fused_embedding_eltwise_layernorm"])
+        # config.exp_disable_tensorrt_ops(["tmp_3"])
+
         if args.device == "gpu":
             # set GPU configs accordingly
             config.enable_use_gpu(100, 0)
@@ -239,8 +246,8 @@ def create_predictor(cls, args):
             dynamic_shape_file = os.path.join(args.model_path,
                                               "dynamic_shape.txt")
             if os.path.exists(dynamic_shape_file):
-                config.enable_tuned_tensorrt_dynamic_shape(dynamic_shape_file,
-                                                           True)
+                config.enable_tuned_tensorrt_dynamic_shape(
+                    dynamic_shape_file, True)
                 print("trt set dynamic shape done!")
             else:
                 config.collect_shape_range_info(dynamic_shape_file)
@@ -365,4 +372,4 @@ def main():

 if __name__ == "__main__":
     paddle.set_device("cpu")
-    main()
+    main()
\ No newline at end of file
diff --git a/paddleslim/quant/advanced/auto_clip.py b/paddleslim/quant/advanced/auto_clip.py
new file mode 100644
index 000000000..ac7166ed7
--- /dev/null
+++ b/paddleslim/quant/advanced/auto_clip.py
@@ -0,0 +1,172 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+AutoClip.
+""" +import paddle +import paddle.nn as nn +import numpy as np +from .utils import fake_quant +from .metrics import mse_loss +from paddle.distributed.fleet.meta_parallel import ( + ColumnParallelLinear, + RowParallelLinear, ) +__all__ = ['AutoClip'] + + +class AutoClip(nn.Layer): + """ + AutoClip from AWQ[https://arxiv.org/abs/2306.00978] + """ + + def __init__( + self, + model, + weight_bits=4, + weight_quant_method='groupwise', + loss_function=mse_loss, + sample_function=None, + n_grid=20, + max_shrink=0.5, + n_sample_token=512, + group_size=128, ): + super(AutoClip, self).__init__() + self.model = model + self.weight_bits = weight_bits + self.weight_method = weight_quant_method + self.loss_function = loss_function + self.n_grid = n_grid + self.max_shrink = max_shrink + self.n_sample_token = n_sample_token + self.bnt = (1 << (self.weight_bits - 1)) - 1 + self.sampled_inputs = {} + self.sample_function = sample_function + self.group_size = group_size + + self._apply_hook() + + def _apply_hook(self): + self._forward_hook_list = [] + for _, sub_layer in self.model.named_sublayers(): + if type(sub_layer) in [ + ColumnParallelLinear, RowParallelLinear, paddle.nn.Linear + ]: + forward_pre_hook_handle = sub_layer.register_forward_pre_hook( + self._forward_pre_hook) + self._forward_hook_list.append(forward_pre_hook_handle) + + def _forward_pre_hook(self, layer, input): + self._sample_scale(input, layer.full_name()) + return input + + def _sample_scale(self, input, name): + input = input[0] if type(input) == tuple else input + input.stop_gradient = True + if name not in self.sampled_inputs: + self.sampled_inputs[name] = input + else: + if self.sample_function is not None: + self.sampled_inputs[name] = self.sample_function.sample( + input, self.sampled_inputs[name], name) + else: + self.sampled_inputs[name] = input + + def auto_clip(self, group_size=128, oc_batch_size=256): + """ + search clip scale for each layer and update the layer's weight + """ + for sub_name, sub_layer in self.model.named_sublayers(): + name = sub_layer.full_name() + if name not in self.sampled_inputs or 'out_linear' in sub_name: + continue + + weight = sub_layer.weight.cast('float16') + weight_t = paddle.transpose(weight, perm=[1, 0]) + x = self.sampled_inputs[name].cast('float16') + print('AutoClipping', sub_name, name, x.shape, weight.shape) + x = x.reshape([-1, x.shape[-1]]) + x = x.reshape([1, x.shape[0], -1, group_size]) + x = x[:, 0::x.shape[1] // self.n_sample_token] + weight_t = weight_t.reshape([weight_t.shape[0], 1, -1, group_size]) + oc_batch_size = oc_batch_size if weight_t.shape[ + 0] % oc_batch_size == 0 else 128 # prevent OOM + assert weight_t.shape[0] % oc_batch_size == 0 + + w_all = weight_t + best_max_val_all = [] + + for i_b in range(weight_t.shape[0] // oc_batch_size): + w = w_all[i_b * oc_batch_size:(i_b + 1) * oc_batch_size] + + org_max_val = w.abs().max( + axis=-1, keepdim=True) # co, 1, n_group, 1 + best_max_val = org_max_val.clone() + min_errs = paddle.ones_like(org_max_val, dtype='float16') * 1e9 + org_out = (x * w).sum(axis=-1) # co, n_token, n_group + for i_s in range(int(self.max_shrink * self.n_grid)): + max_val = org_max_val * (1 - i_s / self.n_grid) + max_val_tmp = max_val + cur_w = paddle.where(w > max_val_tmp, max_val_tmp, w) + cur_w = paddle.where(cur_w < -max_val_tmp, -max_val_tmp, + cur_w) + org_w_shape = cur_w.shape + cur_w_r = cur_w.reshape([-1, + self.group_size]).transpose([1, 0]) + quant_dequant_weight = fake_quant( + cur_w_r, method='abs_max_channel_wise', weight_bits=4) + 
quant_dequant_weight = quant_dequant_weight.transpose(
+                        [1, 0]).reshape(org_w_shape)
+                    cur_out = (x * quant_dequant_weight).sum(axis=-1)
+                    # co, 1, n_group, 1
+                    tmp = (cur_out - org_out).detach().clone()
+                    err = paddle.pow(tmp,
+                                     2).mean(axis=1).reshape(min_errs.shape)
+                    print('block {} search s {} err {}'.format(
+                        i_b, i_s, err.mean().item()))
+                    del cur_w, cur_out, quant_dequant_weight, tmp, cur_w_r
+                    paddle.device.cuda.empty_cache()
+
+                    cur_best_idx = paddle.where(err < min_errs)
+                    if cur_best_idx[0].shape[0] != 0:
+                        min_errs[cur_best_idx] = err[cur_best_idx]
+                        best_max_val[cur_best_idx] = max_val[cur_best_idx]
+                best_max_val_all.append(best_max_val)
+
+                del org_out, org_max_val, min_errs, best_max_val, err, cur_best_idx, max_val_tmp, max_val, w
+                paddle.device.cuda.empty_cache()
+
+            best_max_val = paddle.concat(best_max_val_all, axis=0)
+            best_max_val = paddle.squeeze(best_max_val, axis=1)
+            for param in sub_layer.parameters(include_sublayers=False):
+                if 'w_0' in param.name:
+                    param_tmp = param.transpose(perm=[1, 0]).cast('float16')
+                    tmp_shape = param_tmp.shape
+                    param_tmp = param_tmp.reshape(
+                        [best_max_val.shape[0], best_max_val.shape[1], -1])
+                    best_max_val = paddle.tile(
+                        best_max_val, repeat_times=(1, 1, param_tmp.shape[-1]))
+                    param_tmp = paddle.where(param_tmp > best_max_val,
+                                             best_max_val, param_tmp)
+                    param_tmp = paddle.where(param_tmp < -best_max_val,
+                                             -best_max_val, param_tmp)
+                    param_tmp = param_tmp.reshape(tmp_shape).cast(param.dtype)
+                    param_tmp = param_tmp.transpose(perm=[1, 0])
+                    paddle.assign(param_tmp, output=param)
+                    del param_tmp
+                    paddle.device.cuda.empty_cache()
+                    break
+
+            del best_max_val, weight_t, x, weight, self.sampled_inputs[
+                name], w_all, best_max_val_all
+            paddle.device.cuda.empty_cache()
diff --git a/paddleslim/quant/advanced/gptq.py b/paddleslim/quant/advanced/gptq.py
index 96566858f..f5b73971f 100644
--- a/paddleslim/quant/advanced/gptq.py
+++ b/paddleslim/quant/advanced/gptq.py
@@ -182,4 +182,4 @@ def fasterquant(self,
         self.quantized = True

         del H, Q, Hinv, W, Losses
-        paddle.device.cuda.empty_cache()
+        paddle.device.cuda.empty_cache()
\ No newline at end of file

From ec0f5a30def49fec65777d888e155a8dbd280801 Mon Sep 17 00:00:00 2001
From: lizexu123 <39205361+lizexu123@users.noreply.github.com>
Date: Tue, 30 Jan 2024 10:55:20 +0800
Subject: [PATCH 3/3] NLP model auto-compression example
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 paddleslim/quant/advanced/auto_clip.py | 172 -------------------------
 1 file changed, 172 deletions(-)
 delete mode 100644 paddleslim/quant/advanced/auto_clip.py

diff --git a/paddleslim/quant/advanced/auto_clip.py b/paddleslim/quant/advanced/auto_clip.py
deleted file mode 100644
index ac7166ed7..000000000
--- a/paddleslim/quant/advanced/auto_clip.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. -""" -AutoClip. -""" -import paddle -import paddle.nn as nn -import numpy as np -from .utils import fake_quant -from .metrics import mse_loss -from paddle.distributed.fleet.meta_parallel import ( - ColumnParallelLinear, - RowParallelLinear, ) -__all__ = ['AutoClip'] - - -class AutoClip(nn.Layer): - """ - AutoClip from AWQ[https://arxiv.org/abs/2306.00978] - """ - - def __init__( - self, - model, - weight_bits=4, - weight_quant_method='groupwise', - loss_function=mse_loss, - sample_function=None, - n_grid=20, - max_shrink=0.5, - n_sample_token=512, - group_size=128, ): - super(AutoClip, self).__init__() - self.model = model - self.weight_bits = weight_bits - self.weight_method = weight_quant_method - self.loss_function = loss_function - self.n_grid = n_grid - self.max_shrink = max_shrink - self.n_sample_token = n_sample_token - self.bnt = (1 << (self.weight_bits - 1)) - 1 - self.sampled_inputs = {} - self.sample_function = sample_function - self.group_size = group_size - - self._apply_hook() - - def _apply_hook(self): - self._forward_hook_list = [] - for _, sub_layer in self.model.named_sublayers(): - if type(sub_layer) in [ - ColumnParallelLinear, RowParallelLinear, paddle.nn.Linear - ]: - forward_pre_hook_handle = sub_layer.register_forward_pre_hook( - self._forward_pre_hook) - self._forward_hook_list.append(forward_pre_hook_handle) - - def _forward_pre_hook(self, layer, input): - self._sample_scale(input, layer.full_name()) - return input - - def _sample_scale(self, input, name): - input = input[0] if type(input) == tuple else input - input.stop_gradient = True - if name not in self.sampled_inputs: - self.sampled_inputs[name] = input - else: - if self.sample_function is not None: - self.sampled_inputs[name] = self.sample_function.sample( - input, self.sampled_inputs[name], name) - else: - self.sampled_inputs[name] = input - - def auto_clip(self, group_size=128, oc_batch_size=256): - """ - search clip scale for each layer and update the layer's weight - """ - for sub_name, sub_layer in self.model.named_sublayers(): - name = sub_layer.full_name() - if name not in self.sampled_inputs or 'out_linear' in sub_name: - continue - - weight = sub_layer.weight.cast('float16') - weight_t = paddle.transpose(weight, perm=[1, 0]) - x = self.sampled_inputs[name].cast('float16') - print('AutoClipping', sub_name, name, x.shape, weight.shape) - x = x.reshape([-1, x.shape[-1]]) - x = x.reshape([1, x.shape[0], -1, group_size]) - x = x[:, 0::x.shape[1] // self.n_sample_token] - weight_t = weight_t.reshape([weight_t.shape[0], 1, -1, group_size]) - oc_batch_size = oc_batch_size if weight_t.shape[ - 0] % oc_batch_size == 0 else 128 # prevent OOM - assert weight_t.shape[0] % oc_batch_size == 0 - - w_all = weight_t - best_max_val_all = [] - - for i_b in range(weight_t.shape[0] // oc_batch_size): - w = w_all[i_b * oc_batch_size:(i_b + 1) * oc_batch_size] - - org_max_val = w.abs().max( - axis=-1, keepdim=True) # co, 1, n_group, 1 - best_max_val = org_max_val.clone() - min_errs = paddle.ones_like(org_max_val, dtype='float16') * 1e9 - org_out = (x * w).sum(axis=-1) # co, n_token, n_group - for i_s in range(int(self.max_shrink * self.n_grid)): - max_val = org_max_val * (1 - i_s / self.n_grid) - max_val_tmp = max_val - cur_w = paddle.where(w > max_val_tmp, max_val_tmp, w) - cur_w = paddle.where(cur_w < -max_val_tmp, -max_val_tmp, - cur_w) - org_w_shape = cur_w.shape - cur_w_r = cur_w.reshape([-1, - 
self.group_size]).transpose([1, 0]) - quant_dequant_weight = fake_quant( - cur_w_r, method='abs_max_channel_wise', weight_bits=4) - quant_dequant_weight = quant_dequant_weight.transpose( - [1, 0]).reshape(org_w_shape) - cur_out = (x * quant_dequant_weight).sum(axis=-1) - # co, 1, n_group, 1 - tmp = (cur_out - org_out).detach().clone() - err = paddle.pow(tmp, - 2).mean(axis=1).reshape(min_errs.shape) - print('block {} search s {} err {}'.format( - i_b, i_s, err.mean().item())) - del cur_w, cur_out, quant_dequant_weight, tmp, cur_w_r - paddle.device.cuda.empty_cache() - - cur_best_idx = paddle.where(err < min_errs) - if cur_best_idx[0].shape[0] != 0: - min_errs[cur_best_idx] = err[cur_best_idx] - best_max_val[cur_best_idx] = max_val[cur_best_idx] - best_max_val_all.append(best_max_val) - - del org_out, org_max_val, min_errs, best_max_val, err, cur_best_idx, max_val_tmp, max_val, w - paddle.device.cuda.empty_cache() - - best_max_val = paddle.concat(best_max_val_all, axis=0) - best_max_val = paddle.squeeze(best_max_val, axis=1) - for param in sub_layer.parameters(include_sublayers=False): - if 'w_0' in param.name: - param_tmp = param.transpose(perm=[1, 0]).cast('float16') - tmp_shape = param_tmp.shape - param_tmp = param_tmp.reshape( - [best_max_val.shape[0], best_max_val.shape[1], -1]) - best_max_val = paddle.tile( - best_max_val, repeat_times=(1, 1, param_tmp.shape[-1])) - param_tmp = paddle.where(param_tmp > best_max_val, - best_max_val, param_tmp) - param_tmp = paddle.where(param_tmp < -best_max_val, - -best_max_val, param_tmp) - param_tmp = param_tmp.reshape(tmp_shape).cast(param.dtype) - param_tmp = param_tmp.transpose(perm=[1, 0]) - paddle.assign(param_tmp, output=param) - del param_tmp - paddle.device.cuda.empty_cache() - break - - del best_max_val, weight_t, x, weight, self.sampled_inputs[ - name], w_all, best_max_val_all - paddle.device.cuda.empty_cache()
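
PATCH 2/3 adds the `AutoClip` implementation (AWQ-style weight clipping) and PATCH 3/3 removes it again. For readers of the series, here is a hedged sketch of how the class appears intended to be used, inferred from its forward pre-hooks and `auto_clip()` method: wrap a dygraph model so the hooks can sample each Linear layer's inputs, run a few calibration forwards, then search and apply the clipping thresholds. The toy model and calibration batches are assumptions, not part of the patches, and a CUDA build of Paddle is assumed since the search runs in float16 and calls `paddle.device.cuda.empty_cache()`.

```python
# Hedged usage sketch for AutoClip (added in PATCH 2/3, removed in PATCH 3/3).
# The model and calibration data are placeholders; import path assumed from
# the file location paddleslim/quant/advanced/auto_clip.py.
import paddle
from paddleslim.quant.advanced.auto_clip import AutoClip

# Tiny stand-in model: AutoClip hooks every paddle.nn.Linear sublayer.
model = paddle.nn.Sequential(
    paddle.nn.Linear(512, 512), paddle.nn.ReLU(), paddle.nn.Linear(512, 512))

# Construction registers forward pre-hooks that cache each layer's inputs.
auto_clip = AutoClip(model, weight_bits=4, group_size=128, n_sample_token=16)

# Calibration forwards populate auto_clip.sampled_inputs through the hooks.
with paddle.no_grad():
    for _ in range(4):
        model(paddle.randn([16, 512], dtype='float32'))

# Grid-search per-group clipping thresholds and clip the weights in place.
auto_clip.auto_clip(group_size=128)
```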