From 0bf10c913cf02fe5d7d87cb00a52b280436873d6 Mon Sep 17 00:00:00 2001
From: whs
Date: Wed, 14 Jun 2023 10:11:46 +0800
Subject: [PATCH 1/3] Change in_dygraph_mode to in_dynamic_mode (#1764)

---
 paddleslim/nas/ofa/layers.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/paddleslim/nas/ofa/layers.py b/paddleslim/nas/ofa/layers.py
index f8edd9aab..1477b8070 100644
--- a/paddleslim/nas/ofa/layers.py
+++ b/paddleslim/nas/ofa/layers.py
@@ -20,7 +20,6 @@
 from ...common import get_logger
 from .utils.utils import compute_start_end, get_same_padding, convert_to_list
 from .layers_base import *
-from paddle.framework import in_dygraph_mode

 __all__ = [
     'SuperConv2D', 'SuperConv2DTranspose', 'SuperSeparableConv2D',
@@ -985,7 +984,7 @@ def forward(self, input):
                 "use_global_stats", self._use_global_stats,
                 "trainable_statistics", trainable_statistics)

-            if in_dygraph_mode():
+            if paddle.in_dynamic_mode():
                 paddle_compile = os.environ.get("paddle_compile")
                 if feature_dim != self._mean.shape[0]:
                     if not paddle_compile or "Develop" in paddle_compile:

From a5ba49f5f31eff2f1babf7e28ee15f11cfd1d647 Mon Sep 17 00:00:00 2001
From: lizexu123 <39205361+lizexu123@users.noreply.github.com>
Date: Tue, 30 Jan 2024 10:55:20 +0800
Subject: [PATCH 2/3] Fix the NLP model auto-compression example
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 example/auto_compression/nlp/README.md        |  48 +++--
 .../nlp/configs/ernie3.0/tnews.yaml           |  23 ++-
 .../nlp/configs/pp-minilm/auto/afqmc.yaml     |  20 +-
 .../nlp/paddle_inference_eval.py              |  21 ++-
 paddleslim/quant/advanced/auto_clip.py        | 172 ++++++++++++++++++
 paddleslim/quant/advanced/gptq.py             |   2 +-
 6 files changed, 238 insertions(+), 48 deletions(-)
 create mode 100644 paddleslim/quant/advanced/auto_clip.py

diff --git a/example/auto_compression/nlp/README.md b/example/auto_compression/nlp/README.md
index c98f1987e..da35eebcc 100644
--- a/example/auto_compression/nlp/README.md
+++ b/example/auto_compression/nlp/README.md
@@ -56,16 +56,16 @@
 #### 3.1 Environment setup
 - python >= 3.6
-- PaddlePaddle >= 2.4 (install from the [Paddle website](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html))
-- PaddleSlim >= 2.4
-- PaddleNLP >= 2.3
+- PaddlePaddle ==2.5 (install from the [Paddle website](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html))
+- PaddleSlim ==2.5
+- PaddleNLP ==2.6

 Install paddlepaddle:
 ```shell
 # CPU
-pip install paddlepaddle==2.4.1
+pip install paddlepaddle==2.5.0
-# GPU: Ubuntu with CUDA 11.2, for example
-python -m pip install paddlepaddle-gpu==2.4.1.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
+# GPU: Ubuntu with CUDA 11.6, for example
+python -m pip install paddlepaddle-gpu==2.5.0.post116 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
 ```

 Install paddleslim:
@@ -95,7 +95,6 @@
 pip install paddlenlp
 | Model | AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL |
 |:------:|:------:|:------:|:------:|:------:|:-----------:|:------:|:------:|
 | PP-MiniLM | [afqmc](https://bj.bcebos.com/v1/paddle-slim-models/act/afqmc.tar) | [tnews](https://bj.bcebos.com/v1/paddle-slim-models/act/tnews.tar) | [iflytek](https://bj.bcebos.com/v1/paddle-slim-models/act/iflytek.tar) | [cmnli](https://bj.bcebos.com/v1/paddle-slim-models/act/cmnli.tar) | [ocnli](https://bj.bcebos.com/v1/paddle-slim-models/act/ocnli.tar) | [cluewsc2020](https://bj.bcebos.com/v1/paddle-slim-models/act/cluewsc.tar) | [csl](https://bj.bcebos.com/v1/paddle-slim-models/act/csl.tar) |
 | ERNIE 3.0-Medium | [afqmc](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/AFQMC.tar) | [tnews](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/TNEWS.tar) | [iflytek](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/IFLYTEK.tar) | [cmnli](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/CMNLI.tar) | [ocnli](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/OCNLI.tar) | [cluewsc2020](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/CLUEWSC2020.tar) | [csl](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/CSL.tar) |
-| UIE-base | [expense-reimbursement work orders](https://bj.bcebos.com/v1/paddle-slim-models/act/uie_base.tar) |

 Get the model from the links in the table above and download the inference model files with the following commands:
@@ -119,11 +118,6 @@
 export CUDA_VISIBLE_DEVICES=0
 python run.py --config_path='./configs/pp-minilm/auto/afqmc.yaml' --save_dir='./save_afqmc_pruned/'
 ```
-Auto-compression of the UIE family of models is launched with the run_uie.py script, which uses the ```paddleslim.auto_compression.AutoCompression``` interface to compress the model automatically. Configure the training parameters in the config file, passing in the task name, model type, dataset name, and compression parameters; once configured, the model can be trained with distillation and quantization.
-```shell
-export CUDA_VISIBLE_DEVICES=0
-python run_uie.py --config_path='./configs/uie/uie_base.yaml' --save_dir='./save_uie_qat/'
-```

 To only validate model accuracy (for example, after compression), change the model folder ```model_dir``` in the config file to the directory saved after compression, e.g. ```./save_afqmc_pruned```, and launch ```run.py``` with ```--eval True``` added to the command:
 ```shell
@@ -217,8 +211,6 @@ QuantPost:

 - TensorRT inference:

-Environment setup: if the TensorRT inference engine is used, install a Paddle build compiled with ```WITH_TRT=ON```; download it from the [Python inference library](https://paddleinference.paddlepaddle.org.cn/master/user_guides/download_lib.html#python).
-
 First download the quantized model:
 ```shell
 wget https://bj.bcebos.com/v1/paddle-slim-models/act/save_ppminilm_afqmc_new_calib.tar
 tar -xf save_ppminilm_afqmc_new_calib.tar
 ```

 ```shell
 python paddle_inference_eval.py \
-    --model_path=save_ernie3_afqmc_new_cablib \
+    --model_path=save_ppminilm_afqmc_new_calib \
+    --model_filename=inference.pdmodel \
+    --params_filename=inference.pdiparams \
+    --task_name='afqmc' \
+    --use_trt \
+    --precision=int8
+```
+
+- ERNIE 3.0-Medium:
+```shell
+python paddle_inference_eval.py \
+    --model_path=TNEWS \
     --model_filename=infer.pdmodel \
     --params_filename=infer.pdiparams \
-    --task_name='afqmc' \
+    --task_name='tnews' \
+    --use_trt \
+    --precision=fp32
+```
+```shell
+python paddle_inference_eval.py \
+    --model_path=save_tnews_pruned \
+    --model_filename=infer.pdmodel \
+    --params_filename=infer.pdiparams \
+    --task_name='tnews' \
     --use_trt \
     --precision=int8
 ```

 ```shell
 python paddle_inference_eval.py \
-    --model_path=save_ernie3_afqmc_new_cablib \
-    --model_filename=infer.pdmodel \
-    --params_filename=infer.pdiparams \
+    --model_path=save_ppminilm_afqmc_new_calib \
+    --model_filename=inference.pdmodel \
+    --params_filename=inference.pdiparams \
     --task_name='afqmc' \
     --device=cpu \
     --use_mkldnn=True \
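For readers following the deployment commands above: below is a minimal sketch of the TensorRT int8 predictor setup those commands exercise. It is an illustration, not the script itself; the model path, batch/sequence shapes, and input layout are assumptions, and the real `paddle_inference_eval.py` (see its diff later in this patch) additionally handles dynamic-shape tuning and task-specific tokenization.

```python
# Hedged sketch of the TensorRT int8 inference flow behind the README
# commands above. Paths and input shapes are illustrative assumptions.
import numpy as np
from paddle.inference import Config, PrecisionType, create_predictor

config = Config("save_ppminilm_afqmc_new_calib/inference.pdmodel",
                "save_ppminilm_afqmc_new_calib/inference.pdiparams")
config.enable_use_gpu(100, 0)  # 100 MB initial memory pool on GPU 0
config.enable_tensorrt_engine(
    workspace_size=1 << 30,
    max_batch_size=32,
    min_subgraph_size=5,
    precision_mode=PrecisionType.Int8,
    use_static=False,
    use_calib_mode=False)
# A first run can record tensor shape ranges; later runs would load the tuned
# file via config.enable_tuned_tensorrt_dynamic_shape("dynamic_shape.txt", True).
config.collect_shape_range_info("dynamic_shape.txt")
predictor = create_predictor(config)

# Feed a dummy batch of token ids, assumed [batch, seq_len] int64 inputs.
for name in predictor.get_input_names():
    predictor.get_input_handle(name).copy_from_cpu(
        np.ones((1, 128), dtype="int64"))
predictor.run()
logits = predictor.get_output_handle(
    predictor.get_output_names()[0]).copy_to_cpu()
print(logits.shape)
```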
diff --git a/example/auto_compression/nlp/configs/ernie3.0/tnews.yaml b/example/auto_compression/nlp/configs/ernie3.0/tnews.yaml
index 49093ab87..b90da628a 100644
--- a/example/auto_compression/nlp/configs/ernie3.0/tnews.yaml
+++ b/example/auto_compression/nlp/configs/ernie3.0/tnews.yaml
@@ -6,12 +6,17 @@ Global:
   dataset: clue
   batch_size: 16
   max_seq_length: 128
-TrainConfig:
-  epochs: 6
-  eval_iter: 1110
-  learning_rate: 2.0e-5
-  optimizer_builder:
-    optimizer:
-      type: AdamW
-    weight_decay: 0.01
-  origin_metric: 0.5700
+
+# Pruning
+Prune:
+  prune_algo: transformer_pruner
+  pruned_ratio: 0.25
+
+# Post-training quantization
+QuantPost:
+  activation_bits: 8
+  quantize_op_types:
+  - depthwise_conv2d
+  - conv2d
+  weight_bits: 8
\ No newline at end of file
diff --git a/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml b/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml
index 9c9f58826..fdf65673b 100644
--- a/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml
+++ b/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml
@@ -6,17 +6,11 @@ Global:
   dataset: clue
   batch_size: 16
   max_seq_length: 128
-TransformerPrune:
-  pruned_ratio: 0.25
-HyperParameterOptimization:
-Distillation:
+
+# Post-training quantization
 QuantPost:
-TrainConfig:
-  epochs: 6
-  eval_iter: 1070
-  learning_rate: 2.0e-5
-  optimizer_builder:
-    optimizer:
-      type: AdamW
-    weight_decay: 0.01
-  origin_metric: 0.7403
+  activation_bits: 8
+  quantize_op_types:
+  - conv2d
+  - depthwise_conv2d
+  weight_bits: 8
\ No newline at end of file
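The two configs above drop the training-based strategies (TransformerPrune, Distillation, TrainConfig) in favor of pure post-training quantization. For orientation, here is a hedged sketch of the `run.py`-style launcher that consumes such a yaml through `paddleslim.auto_compression.AutoCompression`. The model paths, feed names, and dummy dataloader are placeholders, and constructor details may vary across PaddleSlim versions; the real example script builds its dataloader from the CLUE dataset.

```python
# Hedged sketch of a run.py-style launcher feeding one of the yaml configs
# above to AutoCompression. Paths and feed names are assumptions.
import numpy as np
import paddle
from paddleslim.common import load_config
from paddleslim.auto_compression import AutoCompression

paddle.enable_static()

def train_dataloader():
    # Placeholder calibration batches; feed names assumed to match the
    # exported model's inputs.
    for _ in range(8):
        yield {"input_ids": np.zeros((16, 128), dtype="int64"),
               "token_type_ids": np.zeros((16, 128), dtype="int64")}

ac = AutoCompression(
    model_dir="./afqmc",                  # downloaded inference model
    model_filename="inference.pdmodel",
    params_filename="inference.pdiparams",
    save_dir="./save_afqmc_quant",
    config=load_config("./configs/pp-minilm/auto/afqmc.yaml"),
    train_dataloader=train_dataloader())
ac.compress()
```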
diff --git a/example/auto_compression/nlp/paddle_inference_eval.py b/example/auto_compression/nlp/paddle_inference_eval.py
index f48e20698..073f032e5 100644
--- a/example/auto_compression/nlp/paddle_inference_eval.py
+++ b/example/auto_compression/nlp/paddle_inference_eval.py
@@ -91,7 +91,8 @@ def parse_args():
         "--max_seq_length",
         default=128,
         type=int,
-        help="The maximum total input sequence length after tokenization. Sequences longer "
+        help=
+        "The maximum total input sequence length after tokenization. Sequences longer "
         "than this will be truncated, sequences shorter will be padded.", )
     parser.add_argument(
         "--perf_warmup_steps",
@@ -107,7 +108,8 @@ def parse_args():
         type=str,
         default="fp32",
         choices=["fp32", "fp16", "int8"],
-        help="The precision of inference. It can be 'fp32', 'fp16' or 'int8'. Default is 'fp16'.",
+        help=
+        "The precision of inference. It can be 'fp32', 'fp16' or 'int8'. Default is 'fp32'.", )
     parser.add_argument(
         "--use_mkldnn",
@@ -156,8 +158,7 @@ def _convert_example(example,
         }
     elif "target" in example:  # wsc
         text, query, pronoun, query_idx, pronoun_idx = (
-            example["text"],
-            example["target"]["span1_text"],
+            example["text"], example["target"]["span1_text"],
             example["target"]["span2_text"],
             example["target"]["span1_index"],
             example["target"]["span2_index"], )
@@ -209,6 +210,12 @@ def create_predictor(cls, args):
         config = paddle.inference.Config(
             os.path.join(args.model_path, args.model_filename),
             os.path.join(args.model_path, args.params_filename))
+        config.switch_ir_debug(True)
+        # For the ERNIE 3.0-Medium model
+        # config.exp_disable_tensorrt_ops(["elementwise_add"])
+        # config.exp_disable_tensorrt_ops(["fused_embedding_eltwise_layernorm"])
+        # config.exp_disable_tensorrt_ops(["tmp_3"])
+
         if args.device == "gpu":
             # set GPU configs accordingly
             config.enable_use_gpu(100, 0)
@@ -239,8 +246,8 @@ def create_predictor(cls, args):
             dynamic_shape_file = os.path.join(args.model_path,
                                               "dynamic_shape.txt")
             if os.path.exists(dynamic_shape_file):
-                config.enable_tuned_tensorrt_dynamic_shape(dynamic_shape_file,
-                                                           True)
+                config.enable_tuned_tensorrt_dynamic_shape(
+                    dynamic_shape_file, True)
                 print("trt set dynamic shape done!")
             else:
                 config.collect_shape_range_info(dynamic_shape_file)
@@ -365,4 +372,4 @@ def main():

 if __name__ == "__main__":
     paddle.set_device("cpu")
-    main()
+    main()
\ No newline at end of file
diff --git a/paddleslim/quant/advanced/auto_clip.py b/paddleslim/quant/advanced/auto_clip.py
new file mode 100644
index 000000000..ac7166ed7
--- /dev/null
+++ b/paddleslim/quant/advanced/auto_clip.py
@@ -0,0 +1,172 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+AutoClip.
+""" +import paddle +import paddle.nn as nn +import numpy as np +from .utils import fake_quant +from .metrics import mse_loss +from paddle.distributed.fleet.meta_parallel import ( + ColumnParallelLinear, + RowParallelLinear, ) +__all__ = ['AutoClip'] + + +class AutoClip(nn.Layer): + """ + AutoClip from AWQ[https://arxiv.org/abs/2306.00978] + """ + + def __init__( + self, + model, + weight_bits=4, + weight_quant_method='groupwise', + loss_function=mse_loss, + sample_function=None, + n_grid=20, + max_shrink=0.5, + n_sample_token=512, + group_size=128, ): + super(AutoClip, self).__init__() + self.model = model + self.weight_bits = weight_bits + self.weight_method = weight_quant_method + self.loss_function = loss_function + self.n_grid = n_grid + self.max_shrink = max_shrink + self.n_sample_token = n_sample_token + self.bnt = (1 << (self.weight_bits - 1)) - 1 + self.sampled_inputs = {} + self.sample_function = sample_function + self.group_size = group_size + + self._apply_hook() + + def _apply_hook(self): + self._forward_hook_list = [] + for _, sub_layer in self.model.named_sublayers(): + if type(sub_layer) in [ + ColumnParallelLinear, RowParallelLinear, paddle.nn.Linear + ]: + forward_pre_hook_handle = sub_layer.register_forward_pre_hook( + self._forward_pre_hook) + self._forward_hook_list.append(forward_pre_hook_handle) + + def _forward_pre_hook(self, layer, input): + self._sample_scale(input, layer.full_name()) + return input + + def _sample_scale(self, input, name): + input = input[0] if type(input) == tuple else input + input.stop_gradient = True + if name not in self.sampled_inputs: + self.sampled_inputs[name] = input + else: + if self.sample_function is not None: + self.sampled_inputs[name] = self.sample_function.sample( + input, self.sampled_inputs[name], name) + else: + self.sampled_inputs[name] = input + + def auto_clip(self, group_size=128, oc_batch_size=256): + """ + search clip scale for each layer and update the layer's weight + """ + for sub_name, sub_layer in self.model.named_sublayers(): + name = sub_layer.full_name() + if name not in self.sampled_inputs or 'out_linear' in sub_name: + continue + + weight = sub_layer.weight.cast('float16') + weight_t = paddle.transpose(weight, perm=[1, 0]) + x = self.sampled_inputs[name].cast('float16') + print('AutoClipping', sub_name, name, x.shape, weight.shape) + x = x.reshape([-1, x.shape[-1]]) + x = x.reshape([1, x.shape[0], -1, group_size]) + x = x[:, 0::x.shape[1] // self.n_sample_token] + weight_t = weight_t.reshape([weight_t.shape[0], 1, -1, group_size]) + oc_batch_size = oc_batch_size if weight_t.shape[ + 0] % oc_batch_size == 0 else 128 # prevent OOM + assert weight_t.shape[0] % oc_batch_size == 0 + + w_all = weight_t + best_max_val_all = [] + + for i_b in range(weight_t.shape[0] // oc_batch_size): + w = w_all[i_b * oc_batch_size:(i_b + 1) * oc_batch_size] + + org_max_val = w.abs().max( + axis=-1, keepdim=True) # co, 1, n_group, 1 + best_max_val = org_max_val.clone() + min_errs = paddle.ones_like(org_max_val, dtype='float16') * 1e9 + org_out = (x * w).sum(axis=-1) # co, n_token, n_group + for i_s in range(int(self.max_shrink * self.n_grid)): + max_val = org_max_val * (1 - i_s / self.n_grid) + max_val_tmp = max_val + cur_w = paddle.where(w > max_val_tmp, max_val_tmp, w) + cur_w = paddle.where(cur_w < -max_val_tmp, -max_val_tmp, + cur_w) + org_w_shape = cur_w.shape + cur_w_r = cur_w.reshape([-1, + self.group_size]).transpose([1, 0]) + quant_dequant_weight = fake_quant( + cur_w_r, method='abs_max_channel_wise', weight_bits=4) + 
quant_dequant_weight = quant_dequant_weight.transpose(
+                        [1, 0]).reshape(org_w_shape)
+                    cur_out = (x * quant_dequant_weight).sum(axis=-1)
+                    # co, 1, n_group, 1
+                    tmp = (cur_out - org_out).detach().clone()
+                    err = paddle.pow(tmp,
+                                     2).mean(axis=1).reshape(min_errs.shape)
+                    print('block {} search s {} err {}'.format(
+                        i_b, i_s, err.mean().item()))
+                    del cur_w, cur_out, quant_dequant_weight, tmp, cur_w_r
+                    paddle.device.cuda.empty_cache()
+
+                    cur_best_idx = paddle.where(err < min_errs)
+                    if cur_best_idx[0].shape[0] != 0:
+                        min_errs[cur_best_idx] = err[cur_best_idx]
+                        best_max_val[cur_best_idx] = max_val[cur_best_idx]
+                best_max_val_all.append(best_max_val)
+
+                del org_out, org_max_val, min_errs, best_max_val, err, cur_best_idx, max_val_tmp, max_val, w
+                paddle.device.cuda.empty_cache()
+
+            best_max_val = paddle.concat(best_max_val_all, axis=0)
+            best_max_val = paddle.squeeze(best_max_val, axis=1)
+            for param in sub_layer.parameters(include_sublayers=False):
+                if 'w_0' in param.name:
+                    param_tmp = param.transpose(perm=[1, 0]).cast('float16')
+                    tmp_shape = param_tmp.shape
+                    param_tmp = param_tmp.reshape(
+                        [best_max_val.shape[0], best_max_val.shape[1], -1])
+                    best_max_val = paddle.tile(
+                        best_max_val, repeat_times=(1, 1, param_tmp.shape[-1]))
+                    param_tmp = paddle.where(param_tmp > best_max_val,
+                                             best_max_val, param_tmp)
+                    param_tmp = paddle.where(param_tmp < -best_max_val,
+                                             -best_max_val, param_tmp)
+                    param_tmp = param_tmp.reshape(tmp_shape).cast(param.dtype)
+                    param_tmp = param_tmp.transpose(perm=[1, 0])
+                    paddle.assign(param_tmp, output=param)
+                    del param_tmp
+                    paddle.device.cuda.empty_cache()
+                    break
+
+            del best_max_val, weight_t, x, weight, self.sampled_inputs[
+                name], w_all, best_max_val_all
+            paddle.device.cuda.empty_cache()
diff --git a/paddleslim/quant/advanced/gptq.py b/paddleslim/quant/advanced/gptq.py
index 96566858f..f5b73971f 100644
--- a/paddleslim/quant/advanced/gptq.py
+++ b/paddleslim/quant/advanced/gptq.py
@@ -182,4 +182,4 @@ def fasterquant(self,
         self.quantized = True

         del H, Q, Hinv, W, Losses
-        paddle.device.cuda.empty_cache()
+        paddle.device.cuda.empty_cache()
\ No newline at end of file

From ec0f5a30def49fec65777d888e155a8dbd280801 Mon Sep 17 00:00:00 2001
From: lizexu123 <39205361+lizexu123@users.noreply.github.com>
Date: Tue, 30 Jan 2024 10:55:20 +0800
Subject: [PATCH 3/3] NLP model auto-compression example
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 paddleslim/quant/advanced/auto_clip.py | 172 -------------------------
 1 file changed, 172 deletions(-)
 delete mode 100644 paddleslim/quant/advanced/auto_clip.py

diff --git a/paddleslim/quant/advanced/auto_clip.py b/paddleslim/quant/advanced/auto_clip.py
deleted file mode 100644
index ac7166ed7..000000000
--- a/paddleslim/quant/advanced/auto_clip.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. -""" -AutoClip. -""" -import paddle -import paddle.nn as nn -import numpy as np -from .utils import fake_quant -from .metrics import mse_loss -from paddle.distributed.fleet.meta_parallel import ( - ColumnParallelLinear, - RowParallelLinear, ) -__all__ = ['AutoClip'] - - -class AutoClip(nn.Layer): - """ - AutoClip from AWQ[https://arxiv.org/abs/2306.00978] - """ - - def __init__( - self, - model, - weight_bits=4, - weight_quant_method='groupwise', - loss_function=mse_loss, - sample_function=None, - n_grid=20, - max_shrink=0.5, - n_sample_token=512, - group_size=128, ): - super(AutoClip, self).__init__() - self.model = model - self.weight_bits = weight_bits - self.weight_method = weight_quant_method - self.loss_function = loss_function - self.n_grid = n_grid - self.max_shrink = max_shrink - self.n_sample_token = n_sample_token - self.bnt = (1 << (self.weight_bits - 1)) - 1 - self.sampled_inputs = {} - self.sample_function = sample_function - self.group_size = group_size - - self._apply_hook() - - def _apply_hook(self): - self._forward_hook_list = [] - for _, sub_layer in self.model.named_sublayers(): - if type(sub_layer) in [ - ColumnParallelLinear, RowParallelLinear, paddle.nn.Linear - ]: - forward_pre_hook_handle = sub_layer.register_forward_pre_hook( - self._forward_pre_hook) - self._forward_hook_list.append(forward_pre_hook_handle) - - def _forward_pre_hook(self, layer, input): - self._sample_scale(input, layer.full_name()) - return input - - def _sample_scale(self, input, name): - input = input[0] if type(input) == tuple else input - input.stop_gradient = True - if name not in self.sampled_inputs: - self.sampled_inputs[name] = input - else: - if self.sample_function is not None: - self.sampled_inputs[name] = self.sample_function.sample( - input, self.sampled_inputs[name], name) - else: - self.sampled_inputs[name] = input - - def auto_clip(self, group_size=128, oc_batch_size=256): - """ - search clip scale for each layer and update the layer's weight - """ - for sub_name, sub_layer in self.model.named_sublayers(): - name = sub_layer.full_name() - if name not in self.sampled_inputs or 'out_linear' in sub_name: - continue - - weight = sub_layer.weight.cast('float16') - weight_t = paddle.transpose(weight, perm=[1, 0]) - x = self.sampled_inputs[name].cast('float16') - print('AutoClipping', sub_name, name, x.shape, weight.shape) - x = x.reshape([-1, x.shape[-1]]) - x = x.reshape([1, x.shape[0], -1, group_size]) - x = x[:, 0::x.shape[1] // self.n_sample_token] - weight_t = weight_t.reshape([weight_t.shape[0], 1, -1, group_size]) - oc_batch_size = oc_batch_size if weight_t.shape[ - 0] % oc_batch_size == 0 else 128 # prevent OOM - assert weight_t.shape[0] % oc_batch_size == 0 - - w_all = weight_t - best_max_val_all = [] - - for i_b in range(weight_t.shape[0] // oc_batch_size): - w = w_all[i_b * oc_batch_size:(i_b + 1) * oc_batch_size] - - org_max_val = w.abs().max( - axis=-1, keepdim=True) # co, 1, n_group, 1 - best_max_val = org_max_val.clone() - min_errs = paddle.ones_like(org_max_val, dtype='float16') * 1e9 - org_out = (x * w).sum(axis=-1) # co, n_token, n_group - for i_s in range(int(self.max_shrink * self.n_grid)): - max_val = org_max_val * (1 - i_s / self.n_grid) - max_val_tmp = max_val - cur_w = paddle.where(w > max_val_tmp, max_val_tmp, w) - cur_w = paddle.where(cur_w < -max_val_tmp, -max_val_tmp, - cur_w) - org_w_shape = cur_w.shape - cur_w_r = cur_w.reshape([-1, - 
self.group_size]).transpose([1, 0]) - quant_dequant_weight = fake_quant( - cur_w_r, method='abs_max_channel_wise', weight_bits=4) - quant_dequant_weight = quant_dequant_weight.transpose( - [1, 0]).reshape(org_w_shape) - cur_out = (x * quant_dequant_weight).sum(axis=-1) - # co, 1, n_group, 1 - tmp = (cur_out - org_out).detach().clone() - err = paddle.pow(tmp, - 2).mean(axis=1).reshape(min_errs.shape) - print('block {} search s {} err {}'.format( - i_b, i_s, err.mean().item())) - del cur_w, cur_out, quant_dequant_weight, tmp, cur_w_r - paddle.device.cuda.empty_cache() - - cur_best_idx = paddle.where(err < min_errs) - if cur_best_idx[0].shape[0] != 0: - min_errs[cur_best_idx] = err[cur_best_idx] - best_max_val[cur_best_idx] = max_val[cur_best_idx] - best_max_val_all.append(best_max_val) - - del org_out, org_max_val, min_errs, best_max_val, err, cur_best_idx, max_val_tmp, max_val, w - paddle.device.cuda.empty_cache() - - best_max_val = paddle.concat(best_max_val_all, axis=0) - best_max_val = paddle.squeeze(best_max_val, axis=1) - for param in sub_layer.parameters(include_sublayers=False): - if 'w_0' in param.name: - param_tmp = param.transpose(perm=[1, 0]).cast('float16') - tmp_shape = param_tmp.shape - param_tmp = param_tmp.reshape( - [best_max_val.shape[0], best_max_val.shape[1], -1]) - best_max_val = paddle.tile( - best_max_val, repeat_times=(1, 1, param_tmp.shape[-1])) - param_tmp = paddle.where(param_tmp > best_max_val, - best_max_val, param_tmp) - param_tmp = paddle.where(param_tmp < -best_max_val, - -best_max_val, param_tmp) - param_tmp = param_tmp.reshape(tmp_shape).cast(param.dtype) - param_tmp = param_tmp.transpose(perm=[1, 0]) - paddle.assign(param_tmp, output=param) - del param_tmp - paddle.device.cuda.empty_cache() - break - - del best_max_val, weight_t, x, weight, self.sampled_inputs[ - name], w_all, best_max_val_all - paddle.device.cuda.empty_cache()
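
PATCH 2/3 adds the `AutoClip` implementation (AWQ-style weight clipping) and PATCH 3/3 removes it again. For readers of the series, here is a hedged sketch of how the class appears intended to be used, inferred from its forward pre-hooks and `auto_clip()` method: wrap a dygraph model so the hooks can sample each Linear layer's inputs, run a few calibration forwards, then search and apply the clipping thresholds. The toy model and calibration batches are assumptions, not part of the patches, and a CUDA build of Paddle is assumed since the search runs in float16 and calls `paddle.device.cuda.empty_cache()`.

```python
# Hedged usage sketch for AutoClip (added in PATCH 2/3, removed in PATCH 3/3).
# The model and calibration data are placeholders; import path assumed from
# the file location paddleslim/quant/advanced/auto_clip.py.
import paddle
from paddleslim.quant.advanced.auto_clip import AutoClip

# Tiny stand-in model: AutoClip hooks every paddle.nn.Linear sublayer.
model = paddle.nn.Sequential(
    paddle.nn.Linear(512, 512), paddle.nn.ReLU(), paddle.nn.Linear(512, 512))

# Construction registers forward pre-hooks that cache each layer's inputs.
auto_clip = AutoClip(model, weight_bits=4, group_size=128, n_sample_token=16)

# Calibration forwards populate auto_clip.sampled_inputs through the hooks.
with paddle.no_grad():
    for _ in range(4):
        model(paddle.randn([16, 512], dtype='float32'))

# Grid-search per-group clipping thresholds and clip the weights in place.
auto_clip.auto_clip(group_size=128)
```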