From 49aeb9cbeef4352c3bbb61c88028612880c28afe Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Fri, 28 Jun 2024 14:03:21 +0800 Subject: [PATCH] Add W8A8 quant and examples (#24) Signed-off-by: Mengni Wang --- examples/.config/model_params_onnxrt.json | 30 +- .../quantization/ptq_static/README.md | 63 + .../resnet50/quantization/ptq_static/main.py | 277 ++ .../quantization/ptq_static/prepare_model.py | 57 + .../quantization/ptq_static/requirements.txt | 6 + .../quantization/ptq_static/run_benchmark.sh | 49 + .../quantization/ptq_static/run_quant.sh | 47 + .../bert/quantization/ptq_dynamic/README.md | 51 + .../nlp/bert/quantization/ptq_dynamic/main.py | 442 ++++ .../quantization/ptq_dynamic/prepare_data.sh | 34 + .../quantization/ptq_dynamic/prepare_model.py | 118 + .../quantization/ptq_dynamic/requirements.txt | 8 + .../quantization/ptq_dynamic/run_benchmark.sh | 64 + .../quantization/ptq_dynamic/run_quant.sh | 47 + .../bert/quantization/ptq_static/README.md | 60 + .../nlp/bert/quantization/ptq_static/main.py | 509 ++++ .../quantization/ptq_static/prepare_data.sh | 34 + .../quantization/ptq_static/prepare_model.py | 118 + .../quantization/ptq_static/requirements.txt | 8 + .../quantization/ptq_static/run_benchmark.sh | 64 + .../bert/quantization/ptq_static/run_quant.sh | 53 + .../llama/quantization/weight_only/main.py | 26 +- onnx_neural_compressor/__init__.py | 2 - .../algorithms/layer_wise/core.py | 17 +- .../post_training_quant/__init__.py | 13 + .../post_training_quant/calibrate.py | 637 +++++ .../post_training_quant/calibrator.py | 401 +++ .../post_training_quant/operators/__init__.py | 27 + .../operators/activation.py | 112 + .../post_training_quant/operators/argmax.py | 40 + .../operators/attention.py | 71 + .../post_training_quant/operators/base_op.py | 92 + .../operators/binary_op.py | 150 ++ .../post_training_quant/operators/concat.py | 125 + .../post_training_quant/operators/conv.py | 201 ++ .../operators/direct_q8.py | 78 + .../operators/embed_layernorm.py | 68 + .../post_training_quant/operators/gather.py | 109 + .../post_training_quant/operators/gavgpool.py | 59 + .../post_training_quant/operators/gemm.py | 91 + .../post_training_quant/operators/lstm.py | 138 + .../post_training_quant/operators/matmul.py | 168 ++ .../post_training_quant/operators/maxpool.py | 74 + .../post_training_quant/operators/pad.py | 102 + .../post_training_quant/operators/pooling.py | 81 + .../post_training_quant/operators/reduce.py | 83 + .../post_training_quant/operators/resize.py | 75 + .../post_training_quant/operators/split.py | 88 + .../post_training_quant/operators/unary_op.py | 80 + .../post_training_quant/quantizer.py | 1246 +++++++++ .../algorithms/smoother/core.py | 59 +- onnx_neural_compressor/algorithms/utility.py | 702 +++++ .../algorithms/weight_only/awq.py | 150 +- .../algorithms/weight_only/gptq.py | 105 +- .../algorithms/weight_only/rtn.py | 79 +- .../algorithms/weight_only/utility.py | 332 --- onnx_neural_compressor/config.py | 1239 --------- onnx_neural_compressor/constants.py | 279 +- onnx_neural_compressor/data_reader.py | 27 +- onnx_neural_compressor/onnx_model.py | 289 ++- .../quantization/__init__.py | 4 +- .../quantization/algorithm_entry.py | 246 +- .../quantization/calibrate.py | 32 - onnx_neural_compressor/quantization/config.py | 2249 +++++++++++++++++ .../quantization/matmul_4bits_quantizer.py | 6 +- .../quantization/matmul_nbits_quantizer.py | 81 +- .../quantization/quant_utils.py | 47 + .../quantization/quantize.py | 47 +- onnx_neural_compressor/quantization/tuning.py 
| 155 +- onnx_neural_compressor/utility.py | 377 +-- onnx_neural_compressor/version.py | 2 - requirements.txt | 2 + .../layer_wise/test_layer_wise.py | 8 +- .../post_training_quant/test_calibrate.py | 588 +++++ .../post_training_quant/test_operators.py | 1957 ++++++++++++++ .../test_post_training_quant.py | 203 ++ .../post_training_quant/test_quant_utils.py | 62 + test/quantization/test_autotune.py | 228 +- test/quantization/test_config.py | 267 +- test/quantization/test_smooth_quant.py | 45 +- test/quantization/weight_only/test_awq.py | 4 +- test/quantization/weight_only/test_gptq.py | 4 +- test/quantization/weight_only/test_rtn.py | 4 +- test/utils/test_general.py | 111 +- test/utils/test_param.py | 5 +- test/utils/test_utility.py | 20 - 86 files changed, 14081 insertions(+), 2527 deletions(-) create mode 100644 examples/image_recognition/resnet50/quantization/ptq_static/README.md create mode 100644 examples/image_recognition/resnet50/quantization/ptq_static/main.py create mode 100644 examples/image_recognition/resnet50/quantization/ptq_static/prepare_model.py create mode 100644 examples/image_recognition/resnet50/quantization/ptq_static/requirements.txt create mode 100644 examples/image_recognition/resnet50/quantization/ptq_static/run_benchmark.sh create mode 100644 examples/image_recognition/resnet50/quantization/ptq_static/run_quant.sh create mode 100644 examples/nlp/bert/quantization/ptq_dynamic/README.md create mode 100644 examples/nlp/bert/quantization/ptq_dynamic/main.py create mode 100644 examples/nlp/bert/quantization/ptq_dynamic/prepare_data.sh create mode 100644 examples/nlp/bert/quantization/ptq_dynamic/prepare_model.py create mode 100644 examples/nlp/bert/quantization/ptq_dynamic/requirements.txt create mode 100644 examples/nlp/bert/quantization/ptq_dynamic/run_benchmark.sh create mode 100644 examples/nlp/bert/quantization/ptq_dynamic/run_quant.sh create mode 100644 examples/nlp/bert/quantization/ptq_static/README.md create mode 100644 examples/nlp/bert/quantization/ptq_static/main.py create mode 100644 examples/nlp/bert/quantization/ptq_static/prepare_data.sh create mode 100644 examples/nlp/bert/quantization/ptq_static/prepare_model.py create mode 100644 examples/nlp/bert/quantization/ptq_static/requirements.txt create mode 100644 examples/nlp/bert/quantization/ptq_static/run_benchmark.sh create mode 100644 examples/nlp/bert/quantization/ptq_static/run_quant.sh create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/__init__.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/calibrate.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/calibrator.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/__init__.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/activation.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/argmax.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/attention.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/binary_op.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/concat.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/conv.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/direct_q8.py create mode 100644 
onnx_neural_compressor/algorithms/post_training_quant/operators/embed_layernorm.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/gavgpool.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/gemm.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/lstm.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/matmul.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/maxpool.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/pooling.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/reduce.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/resize.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/split.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/unary_op.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/quantizer.py create mode 100644 onnx_neural_compressor/algorithms/utility.py delete mode 100644 onnx_neural_compressor/algorithms/weight_only/utility.py delete mode 100644 onnx_neural_compressor/config.py delete mode 100644 onnx_neural_compressor/quantization/calibrate.py create mode 100644 onnx_neural_compressor/quantization/config.py create mode 100644 onnx_neural_compressor/quantization/quant_utils.py create mode 100644 test/quantization/post_training_quant/test_calibrate.py create mode 100644 test/quantization/post_training_quant/test_operators.py create mode 100644 test/quantization/post_training_quant/test_post_training_quant.py create mode 100644 test/quantization/post_training_quant/test_quant_utils.py diff --git a/examples/.config/model_params_onnxrt.json b/examples/.config/model_params_onnxrt.json index 5db34a114..085c7ef6c 100644 --- a/examples/.config/model_params_onnxrt.json +++ b/examples/.config/model_params_onnxrt.json @@ -55,6 +55,34 @@ "input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past", "main_script": "main.py", "batch_size": 1 - } + }, + "bert_base_MRPC": { + "model_src_dir": "nlp/bert/quantization/ptq_static", + "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC", + "input_model": "/tf_dataset2/models/onnx/bert_base_MRPC/bert.onnx", + "main_script": "main.py", + "batch_size": 8 + }, + "bert_base_MRPC_dynamic": { + "model_src_dir": "nlp/bert/quantization/ptq_dynamic", + "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC", + "input_model": "/tf_dataset2/models/onnx/bert_base_MRPC/bert.onnx", + "main_script": "main.py", + "batch_size": 8 + }, + "resnet50-v1-12_qdq": { + "model_src_dir": "image_recognition/resnet50/quantization/ptq_static", + "dataset_location": "/tf_dataset2/datasets/imagenet/ImagenetRaw/ILSVRC2012_img_val", + "input_model": "/tf_dataset2/models/onnx/resnet50-v1-12/resnet50-v1-13.onnx", + "main_script": "main.py", + "batch_size": 1 + }, + "resnet50-v1-12": { + "model_src_dir": "image_recognition/resnet50/quantization/ptq_static", + "dataset_location": "/tf_dataset2/datasets/imagenet/ImagenetRaw/ILSVRC2012_img_val", + "input_model": "/tf_dataset2/models/onnx/resnet50-v1-12/resnet50-v1-12.onnx", + "main_script": "main.py", + "batch_size": 1 + }, } } diff --git 
a/examples/image_recognition/resnet50/quantization/ptq_static/README.md b/examples/image_recognition/resnet50/quantization/ptq_static/README.md
new file mode 100644
index 000000000..b8145eff8
--- /dev/null
+++ b/examples/image_recognition/resnet50/quantization/ptq_static/README.md
@@ -0,0 +1,63 @@
+# Step-by-Step
+
+This example loads an image classification model from the [ONNX Model Zoo](https://github.com/onnx/models) and confirms its accuracy and speed on the [ILSVRC2012 validation ImageNet dataset](http://www.image-net.org/challenges/LSVRC/2012/downloads). You need to download this dataset yourself.
+
+# Prerequisite
+
+## 1. Environment
+
+```shell
+pip install onnx-neural-compressor
+pip install -r requirements.txt
+```
+
+
+## 2. Prepare Model
+
+```shell
+python prepare_model.py --output_model='resnet50-v1-12.onnx'
+```
+
+## 3. Prepare Dataset
+
+Download the [ILSVRC2012 validation ImageNet dataset](http://www.image-net.org/challenges/LSVRC/2012/downloads).
+
+Download the label file:
+
+```shell
+wget http://dl.caffe.berkeleyvision.org/caffe_ilsvrc12.tar.gz
+tar -xvzf caffe_ilsvrc12.tar.gz val.txt
+```
+
+# Run
+
+
+## 1. Quantization
+
+Quantize the model with QLinearOps:
+
+```bash
+bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx
+                  --dataset_location=/path/to/imagenet \
+                  --label_path=/path/to/val.txt \
+                  --output_model=path/to/save
+```
+
+Quantize the model in QDQ mode:
+
+```bash
+bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx
+                  --dataset_location=/path/to/imagenet \
+                  --label_path=/path/to/val.txt \
+                  --output_model=path/to/save \
+                  --quant_format=QDQ
+```
+
+## 2. Benchmark
+
+```bash
+bash run_benchmark.sh --input_model=path/to/model \ # model path as *.onnx
+                      --dataset_location=/path/to/imagenet \
+                      --label_path=/path/to/val.txt \
+                      --mode=performance # or accuracy
+```
diff --git a/examples/image_recognition/resnet50/quantization/ptq_static/main.py b/examples/image_recognition/resnet50/quantization/ptq_static/main.py
new file mode 100644
index 000000000..8b6506e1e
--- /dev/null
+++ b/examples/image_recognition/resnet50/quantization/ptq_static/main.py
@@ -0,0 +1,277 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
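The README above drives quantization through `run_quant.sh`, and the `main.py` that follows implements the underlying API calls. As a reading aid, here is a condensed sketch of that flow using only the APIs exercised in this example (`data_reader.CalibrationDataReader`, `config.StaticQuantConfig.get_config_set_for_tuning`, `quantization.QuantFormat`, `tuning.autotune`). The random calibration reader, the constant-return `eval_fn`, and the file names are illustrative placeholders rather than files provided by the example.

```python
# A condensed sketch of the static-quantization flow that main.py implements.
# The random calibration reader, the constant-return eval_fn, and the file
# names are placeholders; the API calls mirror the ones used in this example.
import numpy as np
import onnx

from onnx_neural_compressor import data_reader, quantization
from onnx_neural_compressor.quantization import config, tuning


class RandomDataReader(data_reader.CalibrationDataReader):
    """Feeds a few random NCHW batches for calibration (stand-in for real ImageNet data)."""

    def __init__(self, model_path, num_batches=10):
        model = onnx.load(model_path, load_external_data=False)
        self.input_name = model.graph.input[0].name
        self.batches = [np.random.rand(1, 3, 224, 224).astype(np.float32) for _ in range(num_batches)]
        self.iter_next = iter(self.batches)

    def get_next(self):
        batch = next(self.iter_next, None)
        return {self.input_name: batch} if batch is not None else None

    def rewind(self):
        self.iter_next = iter(self.batches)


def eval_fn(model):
    # Placeholder: return a real accuracy here, e.g. the TopK metric defined in main.py.
    return 1.0


fp32_model = "resnet50-v1-12.onnx"  # produced by prepare_model.py
tune_config = tuning.TuningConfig(
    config_set=config.StaticQuantConfig.get_config_set_for_tuning(
        quant_format=quantization.QuantFormat.QDQ,  # or quantization.QuantFormat.QOperator
    )
)
best_model = tuning.autotune(
    model_input=fp32_model,
    tune_config=tune_config,
    eval_fn=eval_fn,
    calibration_data_reader=RandomDataReader(fp32_model),
)
onnx.save(best_model, "resnet50-v1-12-int8.onnx")
```

In the full script below, the calibration reader feeds preprocessed ImageNet batches and the evaluation function computes top-1 accuracy, which is what lets `autotune` compare candidate configurations against the FP32 baseline.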
+# pylint:disable=redefined-outer-name,logging-format-interpolation + +import argparse +import collections +import logging +import os +import re +import time + +import cv2 +import numpy as np +import onnx +import onnxruntime as ort +from PIL import Image +from sklearn import metrics + +from onnx_neural_compressor import data_reader, quantization +from onnx_neural_compressor.quantization import config, tuning + +logger = logging.getLogger(__name__) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN +) + + +def _topk_shape_validate(preds, labels): + # preds shape can be Nxclass_num or class_num(N=1 by default) + # it's more suitable for 'Accuracy' with preds shape Nx1(or 1) output from argmax + if isinstance(preds, int): + preds = [preds] + preds = np.array(preds) + elif isinstance(preds, np.ndarray): + preds = np.array(preds) + elif isinstance(preds, list): + preds = np.array(preds) + preds = preds.reshape((-1, preds.shape[-1])) + + # consider labels just int value 1x1 + if isinstance(labels, int): + labels = [labels] + labels = np.array(labels) + elif isinstance(labels, tuple): + labels = np.array([labels]) + labels = labels.reshape((labels.shape[-1], -1)) + elif isinstance(labels, list): + if isinstance(labels[0], int): + labels = np.array(labels) + labels = labels.reshape((labels.shape[0], 1)) + elif isinstance(labels[0], tuple): + labels = np.array(labels) + labels = labels.reshape((labels.shape[-1], -1)) + else: + labels = np.array(labels) + # labels most have 2 axis, 2 cases: N(or Nx1 sparse) or Nxclass_num(one-hot) + # only support 2 dimension one-shot labels + # or 1 dimension one-hot class_num will confuse with N + + if len(preds.shape) == 1: + N = 1 + class_num = preds.shape[0] + preds = preds.reshape([-1, class_num]) + elif len(preds.shape) >= 2: + N = preds.shape[0] + preds = preds.reshape([N, -1]) + class_num = preds.shape[1] + + label_N = labels.shape[0] + assert label_N == N, "labels batch size should same with preds" + labels = labels.reshape([N, -1]) + # one-hot labels will have 2 dimension not equal 1 + if labels.shape[1] != 1: + labels = labels.argsort()[..., -1:] + return preds, labels + + +class TopK: + def __init__(self, k=1): + self.k = k + self.num_correct = 0 + self.num_sample = 0 + + def update(self, preds, labels, sample_weight=None): + preds, labels = _topk_shape_validate(preds, labels) + preds = preds.argsort()[..., -self.k :] + if self.k == 1: + correct = metrics.accuracy_score(preds, labels, normalize=False) + self.num_correct += correct + + else: + for p, l in zip(preds, labels): + # get top-k labels with np.argpartition + # p = np.argpartition(p, -self.k)[-self.k:] + l = l.astype("int32") + if l in p: + self.num_correct += 1 + + self.num_sample += len(labels) + + def reset(self): + self.num_correct = 0 + self.num_sample = 0 + + def result(self): + if self.num_sample == 0: + logger.warning("Sample num during evaluation is 0.") + return 0 + return self.num_correct / self.num_sample + + +class DataReader(data_reader.CalibrationDataReader): + def __init__(self, model_path, dataset_location, image_list, batch_size=1, calibration_sampling_size=-1): + self.batch_size = batch_size + self.image_list = [] + self.label_list = [] + src_lst = [] + label_lst = [] + num = 0 + with open(image_list, "r") as f: + for s in f: + image_name, label = re.split(r"\s+", s.strip()) + src = os.path.join(dataset_location, image_name) + if not os.path.exists(src): + continue + src_lst.append(src) + 
label_lst.append(int(label)) + if len(src_lst) == batch_size: + self.image_list.append(src_lst) + self.label_list.append(label_lst) + num += batch_size + if calibration_sampling_size > 0 and num >= calibration_sampling_size: + break + src_lst = [] + label_lst = [] + if len(src_lst) > 0: + self.image_list.append(src_lst) + self.label_list.append(label_lst) + model = onnx.load(model_path, load_external_data=False) + self.inputs_names = [input.name for input in model.graph.input] + self.iter_next = iter(self.image_list) + + def _preprpcess(self, src): + with Image.open(src) as image: + image = np.array(image.convert("RGB")).astype(np.float32) + image = image / 255.0 + image = cv2.resize(image, (256, 256), interpolation=cv2.INTER_LINEAR) + + h, w = image.shape[0], image.shape[1] + + y0 = (h - 224) // 2 + x0 = (w - 224) // 2 + image = image[y0 : y0 + 224, x0 : x0 + 224, :] + image = (image - [0.485, 0.456, 0.406]) / [0.229, 0.224, 0.225] + image = image.transpose((2, 0, 1)) + return image.astype("float32") + + def get_next(self): + lst = next(self.iter_next, None) + if lst is not None: + return {self.inputs_names[0]: np.stack([self._preprpcess(src) for src in lst])} + else: + return None + + def rewind(self): + self.iter_next = iter(self.image_list) + + +def eval_func(model, dataloader, metric): + metric.reset() + sess = ort.InferenceSession(model, providers=ort.get_available_providers()) + labels = dataloader.label_list + for idx, batch in enumerate(dataloader): + output = sess.run(None, batch) + metric.update(output, labels[idx]) + return metric.result() + + +if __name__ == "__main__": + logger.info("Evaluating ONNXRuntime full precision accuracy and performance:") + parser = argparse.ArgumentParser( + description="Resnet50 fine-tune examples for image classification tasks.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("--model_path", type=str, help="Pre-trained model on onnx file") + parser.add_argument("--dataset_location", type=str, help="Imagenet data path") + parser.add_argument("--label_path", type=str, help="Imagenet label path") + parser.add_argument("--benchmark", action="store_true", default=False) + parser.add_argument("--tune", action="store_true", default=False, help="whether quantize the model") + parser.add_argument("--output_model", type=str, help="output model path") + parser.add_argument("--mode", type=str, help="benchmark mode of performance or accuracy") + parser.add_argument( + "--intra_op_num_threads", type=int, default=4, help="intra_op_num_threads for performance benchmark" + ) + parser.add_argument( + "--quant_format", type=str, default="QOperator", choices=["QDQ", "QOperator"], help="quantization format" + ) + parser.add_argument( + "--batch_size", + default=1, + type=int, + ) + args = parser.parse_args() + + top1 = TopK() + dataloader = DataReader(args.model_path, args.dataset_location, args.label_path, args.batch_size) + + def eval(onnx_model): + dataloader.rewind() + return eval_func(onnx_model, dataloader, top1) + + if args.benchmark: + if args.mode == "performance": + total_time = 0.0 + num_iter = 100 + num_warmup = 10 + + sess_options = ort.SessionOptions() + sess_options.intra_op_num_threads = args.intra_op_num_threads + session = ort.InferenceSession(args.model_path, sess_options, providers=ort.get_available_providers()) + ort_inputs = {} + len_inputs = len(session.get_inputs()) + inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] + + for idx, batch in enumerate(dataloader): + if idx + 1 > 
num_iter: + break + tic = time.time() + predictions = session.run(None, batch) + toc = time.time() + if idx >= num_warmup: + total_time += toc - tic + + print("\n", "-" * 10, "Summary:", "-" * 10) + print(args) + throughput = (num_iter - num_warmup) / total_time + print("Throughput: {} samples/s".format(throughput)) + elif args.mode == "accuracy": + acc_result = eval_func(args.model_path, dataloader, top1) + print("Batch size = %d" % dataloader.batch_size) + print("Accuracy: %.5f" % acc_result) + + if args.tune: + calibration_data_reader = DataReader( + args.model_path, args.dataset_location, args.label_path, args.batch_size, calibration_sampling_size=100 + ) + + custom_tune_config = tuning.TuningConfig( + config_set=config.StaticQuantConfig.get_config_set_for_tuning( + quant_format=( + quantization.QuantFormat.QOperator + if args.quant_format == "QOperator" + else quantization.QuantFormat.QDQ + ), + ) + ) + best_model = tuning.autotune( + model_input=args.model_path, + tune_config=custom_tune_config, + eval_fn=eval, + calibration_data_reader=calibration_data_reader, + ) + onnx.save(best_model, args.output_model) diff --git a/examples/image_recognition/resnet50/quantization/ptq_static/prepare_model.py b/examples/image_recognition/resnet50/quantization/ptq_static/prepare_model.py new file mode 100644 index 000000000..8d7d8d4a9 --- /dev/null +++ b/examples/image_recognition/resnet50/quantization/ptq_static/prepare_model.py @@ -0,0 +1,57 @@ +import argparse +import os +import sys +import urllib + +MODEL_URL = "https://github.com/onnx/models/raw/main/validated/vision/classification/resnet/model/resnet50-v1-12.onnx" +MAX_TIMES_RETRY_DOWNLOAD = 5 + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("--input_model", type=str, required=False, default="resnet50-v1-12.onnx") + parser.add_argument("--output_model", type=str, required=True) + return parser.parse_args() + + +def progressbar(cur, total=100): + percent = "{:.2%}".format(cur / total) + sys.stdout.write("\r[%-100s] %s" % ("#" * int(cur), percent)) + sys.stdout.flush() + + +def schedule(blocknum, blocksize, totalsize): + if totalsize == 0: + percent = 0 + else: + percent = min(1.0, blocknum * blocksize / totalsize) * 100 + progressbar(percent) + + +def download_model(url, model_name, retry_times=5): + if os.path.isfile(model_name): + print(f"{model_name} exists, skip download") + return True + + print("download model...") + retries = 0 + while retries < retry_times: + try: + urllib.request.urlretrieve(url, model_name, schedule) + break + except KeyboardInterrupt: + return False + except: + retries += 1 + print(f"Download failed{', Retry downloading...' 
if retries < retry_times else '!'}") + return retries < retry_times + + +def prepare_model(input_model, output_model): + # Download model from [ONNX Model Zoo](https://github.com/onnx/models) + download_model(MODEL_URL, output_model, MAX_TIMES_RETRY_DOWNLOAD) + + +if __name__ == "__main__": + args = parse_arguments() + prepare_model(args.input_model, args.output_model) diff --git a/examples/image_recognition/resnet50/quantization/ptq_static/requirements.txt b/examples/image_recognition/resnet50/quantization/ptq_static/requirements.txt new file mode 100644 index 000000000..1fc10dd8a --- /dev/null +++ b/examples/image_recognition/resnet50/quantization/ptq_static/requirements.txt @@ -0,0 +1,6 @@ +onnx +onnxruntime +torch +torchvision +onnxruntime-extensions +pillow>=8.2.0 # not directly required, pinned by Snyk to avoid a vulnerability diff --git a/examples/image_recognition/resnet50/quantization/ptq_static/run_benchmark.sh b/examples/image_recognition/resnet50/quantization/ptq_static/run_benchmark.sh new file mode 100644 index 000000000..65c3505be --- /dev/null +++ b/examples/image_recognition/resnet50/quantization/ptq_static/run_benchmark.sh @@ -0,0 +1,49 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo "$var" |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo "$var" |cut -f2 -d=) + ;; + --label_path=*) + label_path=$(echo "$var" |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo "$var" |cut -f2 -d=) + ;; + --intra_op_num_threads=*) + intra_op_num_threads=$(echo "$var" |cut -f2 -d=) + ;; + esac + done + +} + +# run_benchmark +function run_benchmark { + + python main.py \ + --model_path "${input_model}" \ + --dataset_location "${dataset_location}" \ + --label_path "${label_path-${dataset_location}/../val.txt}" \ + --mode "${mode}" \ + --batch_size 1 \ + --intra_op_num_threads "${intra_op_num_threads-4}" \ + --benchmark + +} + +main "$@" diff --git a/examples/image_recognition/resnet50/quantization/ptq_static/run_quant.sh b/examples/image_recognition/resnet50/quantization/ptq_static/run_quant.sh new file mode 100644 index 000000000..0e44d8d02 --- /dev/null +++ b/examples/image_recognition/resnet50/quantization/ptq_static/run_quant.sh @@ -0,0 +1,47 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + run_tuning + +} + +# init params +function init_params { + + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo "$var" |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo "$var" |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo "$var" |cut -f2 -d=) + ;; + --label_path=*) + label_path=$(echo "$var" |cut -f2 -d=) + ;; + --quant_format=*) + quant_format=$(echo "$var" |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_tuning { + python main.py \ + --model_path "${input_model}" \ + --dataset_location "${dataset_location}" \ + --label_path "${label_path-${dataset_location}/../val.txt}" \ + --output_model "${output_model}" \ + --quant_format "${quant_format-QOperator}" \ + --tune +} + +main "$@" diff --git a/examples/nlp/bert/quantization/ptq_dynamic/README.md b/examples/nlp/bert/quantization/ptq_dynamic/README.md new file mode 100644 index 000000000..212c8b899 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_dynamic/README.md @@ -0,0 +1,51 @@ +# Step-by-Step + +This example load a BERT model and confirm its accuracy and speed based on [GLUE 
data](https://gluebenchmark.com/). + +# Prerequisite + +## 1. Environment + +```shell +pip install onnx-neural-compressor +pip install -r requirements.txt +``` + + +## 2. Prepare Dataset + +download the GLUE data with `prepare_data.sh` script. + +```shell +export GLUE_DIR=path/to/glue_data +export TASK_NAME=MRPC + +bash prepare_data.sh --data_dir=$GLUE_DIR --task_name=$TASK_NAME +``` + +## 3. Prepare Model + +```shell +python prepare_model.py --input_model='MRPC.zip' --output_model='bert.onnx' +``` + +# Run + +## 1. Quantization + +Dynamic quantization: + +```bash +bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx + --output_model=path/to/model_tune \ # model path as *.onnx + --dataset_location=path/to/glue_data +``` + +## 2. Benchmark + +```bash +bash run_benchmark.sh --input_model=path/to/model \ # model path as *.onnx + --dataset_location=path/to/glue_data \ + --batch_size=batch_size \ + --mode=performance # or accuracy +``` diff --git a/examples/nlp/bert/quantization/ptq_dynamic/main.py b/examples/nlp/bert/quantization/ptq_dynamic/main.py new file mode 100644 index 000000000..38b0b6757 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_dynamic/main.py @@ -0,0 +1,442 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint:disable=redefined-outer-name,logging-format-interpolation + +import argparse +import dataclasses +import logging +import os +import pathlib +import tempfile +import time +from typing import List, Optional, Union + +import numpy as np +import onnx +import onnxruntime +import torch +import transformers +from onnxruntime.transformers import optimizer +from onnxruntime.transformers.fusion_options import FusionOptions +from torch.utils import data + +from onnx_neural_compressor.quantization import config, tuning + +logger = logging.getLogger(__name__) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN +) + + +class ONNXRTBertDataset: + """Dataset used for model Bert. + Args: data_dir (str): The input data dir. + model_name_or_path (str): Path to pre-trained student model or shortcut name, + selected in the list: + max_seq_length (int, default=128): The maximum length after tokenization. + Sequences longer than this will be truncated, + sequences shorter will be padded. + do_lower_case (bool, default=True): Whether to lowercase the input when tokenizing. + task (str, default=mrpc): The name of the task to fine-tune. + Choices include mrpc, qqp, qnli, rte, + sts-b, cola, mnli, wnli. + model_type (str, default='bert'): model type, support 'distilbert', 'bert', + 'mobilebert', 'roberta'. + dynamic_length (bool, default=False): Whether to use fixed sequence length. + evaluate (bool, default=True): Whether do evaluation or training. 
+ transform (transform object, default=None): transform to process input data. + filter (Filter objects, default=None): filter out examples according + to specific conditions. + """ + + def __init__( + self, + model, + data_dir, + model_name_or_path, + max_seq_length=128, + do_lower_case=True, + task="mrpc", + model_type="bert", + dynamic_length=False, + evaluate=True, + transform=None, + filter=None, + ): + self.inputs = [inp.name for inp in onnx.load(model).graph.input] + task = task.lower() + model_type = model_type.lower() + assert task in ["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", "mnli", "wnli", "sst-2"], "Unsupported task type" + assert model_type in [ + "distilbert", + "bert", + "mobilebert", + "roberta", + ], "Unsupported \ + model type" + self.dynamic_length = dynamic_length + self.model_type = model_type + self.max_seq_length = max_seq_length + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) + self.dataset = load_and_cache_examples( + data_dir, model_name_or_path, max_seq_length, task, model_type, tokenizer, evaluate + ) + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, index): + batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in self.dataset[index]) + return batch[: len(self.inputs)], batch[-1] + + +def load_and_cache_examples(data_dir, model_name_or_path, max_seq_length, task, model_type, tokenizer, evaluate): + processor = transformers.glue_processors[task]() + output_mode = transformers.glue_output_modes[task] + # Load data features from cache or dataset file + if not os.path.exists("./dataset_cached"): + os.makedirs("./dataset_cached") + cached_features_file = os.path.join( + "./dataset_cached", + "cached_{}_{}_{}_{}".format( + "dev" if evaluate else "train", + list(filter(None, model_name_or_path.split("/"))).pop(), + str(max_seq_length), + str(task), + ), + ) + if os.path.exists(cached_features_file): + logger.info("Load features from cached file {}.".format(cached_features_file)) + features = torch.load(cached_features_file) + else: + logger.info("Create features from dataset file at {}.".format(data_dir)) + label_list = processor.get_labels() + examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) + features = convert_examples_to_features( + examples, + tokenizer, + task=task, + label_list=label_list, + max_length=max_seq_length, + output_mode=output_mode, + ) + logger.info("Save features into cached file {}.".format(cached_features_file)) + torch.save(features, cached_features_file) + # Convert to Tensors and build dataset + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) + all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long) + if output_mode == "classification": + all_labels = torch.tensor([f.label for f in features], dtype=torch.long) + elif output_mode == "regression": + all_labels = torch.tensor([f.label for f in features], dtype=torch.float) + dataset = data.TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_seq_lengths, all_labels) + return dataset + + +def convert_examples_to_features( + examples, + tokenizer, + max_length=128, + task=None, + label_list=None, + output_mode="classification", + pad_token=0, + 
pad_token_segment_id=0, + mask_padding_with_zero=True, +): + processor = transformers.glue_processors[task]() + if label_list is None: + label_list = processor.get_labels() + logger.info("Use label list {} for task {}.".format(label_list, task)) + label_map = {label: i for i, label in enumerate(label_list)} + features = [] + for ex_index, example in enumerate(examples): + inputs = tokenizer.encode_plus( + example.text_a, + example.text_b, + add_special_tokens=True, + max_length=max_length, + return_token_type_ids=True, + truncation=True, + ) + input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. + seq_length = len(input_ids) + padding_length = max_length - len(input_ids) + + input_ids = input_ids + ([pad_token] * padding_length) + attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) + + assert len(input_ids) == max_length, "Error with input_ids length {} vs {}".format(len(input_ids), max_length) + assert len(attention_mask) == max_length, "Error with attention_mask length {} vs {}".format( + len(attention_mask), max_length + ) + assert len(token_type_ids) == max_length, "Error with token_type_ids length {} vs {}".format( + len(token_type_ids), max_length + ) + if output_mode == "classification": + label = label_map[example.label] + elif output_mode == "regression": + label = float(example.label) + else: + raise KeyError(output_mode) + + feats = InputFeatures( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + label=label, + seq_length=seq_length, + ) + features.append(feats) + return features + + +@dataclasses.dataclass(frozen=True) +class InputFeatures: + """ + A single set of features of data. + Property names are the same names as the corresponding inputs to a model. + Args: + input_ids: Indices of input sequence tokens in the vocabulary. + attention_mask: Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED, + ``0`` for MASKED (padded) tokens. + token_type_ids: (Optional) Segment token indices to indicate first and second + portions of the inputs. Only some models use them. + label: (Optional) Label corresponding to the input. Int for classification problems, + float for regression problems. + seq_length: (Optional) The length of input sequence before padding. + """ + + input_ids: List[int] + attention_mask: Optional[List[int]] = None + token_type_ids: Optional[List[int]] = None + label: Optional[Union[int, float]] = None + seq_length: Optional[List[int]] = None + + +class ONNXRTGLUE: + """Computes GLUE score. + + Args: + task (str, default=mrpc): The name of the task. + Choices include mrpc, qqp, qnli, rte, + sts-b, cola, mnli, wnli. 
+ + """ + + def __init__(self, task="mrpc"): + assert task in ["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", "mnli", "wnli", "sst-2"], "Unsupported task type" + self.pred_list = None + self.label_list = None + self.task = task + self.return_key = { + "cola": "mcc", + "mrpc": "acc", + "sts-b": "corr", + "qqp": "acc", + "mnli": "mnli/acc", + "qnli": "acc", + "rte": "acc", + "wnli": "acc", + "sst-2": "acc", + } + + def update(self, preds, labels): + """add preds and labels to storage""" + if isinstance(preds, list) and len(preds) == 1: + preds = preds[0] + if isinstance(labels, list) and len(labels) == 1: + labels = labels[0] + if self.pred_list is None: + self.pred_list = preds + self.label_list = labels + else: + self.pred_list = np.append(self.pred_list, preds, axis=0) + self.label_list = np.append(self.label_list, labels, axis=0) + + def reset(self): + """clear preds and labels storage""" + self.pred_list = None + self.label_list = None + + def result(self): + """calculate metric""" + output_mode = transformers.glue_output_modes[self.task] + + if output_mode == "classification": + processed_preds = np.argmax(self.pred_list, axis=1) + elif output_mode == "regression": + processed_preds = np.squeeze(self.pred_list) + result = transformers.glue_compute_metrics(self.task, processed_preds, self.label_list) + return result[self.return_key[self.task]] + + +if __name__ == "__main__": + logger.info("Evaluating ONNXRuntime full precision accuracy and performance:") + parser = argparse.ArgumentParser( + description="BERT fine-tune examples for classification/regression tasks.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("--model_path", type=str, help="Pre-trained resnet50 model on onnx file") + parser.add_argument("--benchmark", action="store_true", default=False) + parser.add_argument("--tune", action="store_true", default=False, help="whether quantize the model") + parser.add_argument("--output_model", type=str, help="output model path") + parser.add_argument("--mode", type=str, help="benchmark mode of performance or accuracy") + parser.add_argument("--model_name_or_path", type=str, help="pretrained model name or path") + parser.add_argument("--data_path", type=str, help="input data path") + parser.add_argument( + "--batch_size", + default=8, + type=int, + ) + parser.add_argument( + "--task", + type=str, + default="mrpc", + choices=["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", "mnli", "wnli", "sst-2"], + help="GLUE task name", + ) + parser.add_argument("--dynamic_length", type=bool, default=False, help="dynamic length") + parser.add_argument("--max_seq_length", type=int, default=128, help="max sequence length") + parser.add_argument( + "--model_type", + type=str, + default="bert", + choices=["distilbert", "bert", "mobilebert", "roberta"], + help="model type", + ) + parser.add_argument("--intra_op_num_threads", type=int, default=4) + args = parser.parse_args() + dataset = ONNXRTBertDataset( + args.model_path, + data_dir=args.data_path, + model_name_or_path=args.model_name_or_path, + max_seq_length=args.max_seq_length, + task=args.task, + model_type=args.model_type, + dynamic_length=args.dynamic_length, + ) + dataloader = data.DataLoader( + dataset, + sampler=data.SequentialSampler(dataset), + batch_size=args.batch_size, + shuffle=False, + ) + + def eval_func(model): + metric = ONNXRTGLUE(args.task) + session = onnxruntime.InferenceSession(model, providers=onnxruntime.get_available_providers()) + ort_inputs = {} + len_inputs = len(session.get_inputs()) + 
inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] + + for idx, batch in enumerate(dataloader): + label = batch[-1] + batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in batch[0]) + batch_seq_length = args.max_seq_length if not args.dynamic_length else batch[0].shape[-1] + data = [ + batch[0][:, :batch_seq_length], + batch[1][:, :batch_seq_length], + batch[2][:, :batch_seq_length], + ] + for i in range(len_inputs): + ort_inputs.update({inputs_names[i]: data[i]}) + predictions = session.run(None, ort_inputs) + metric.update(predictions[0], label) + return metric.result() + + if args.benchmark: + if args.mode == "performance": + total_time = 0.0 + num_iter = 100 + num_warmup = 10 + + sess_options = onnxruntime.SessionOptions() + sess_options.intra_op_num_threads = args.intra_op_num_threads + session = onnxruntime.InferenceSession( + args.model_path, sess_options, providers=onnxruntime.get_available_providers() + ) + ort_inputs = {} + len_inputs = len(session.get_inputs()) + inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] + + for idx, batch in enumerate(dataloader): + if idx + 1 > num_iter: + break + + batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in batch[0]) + batch_seq_length = args.max_seq_length if not args.dynamic_length else batch[0].shape[-1] + data = [ + batch[0][:, :batch_seq_length], + batch[1][:, :batch_seq_length], + batch[2][:, :batch_seq_length], + ] + for i in range(len_inputs): + ort_inputs.update({inputs_names[i]: data[i]}) + tic = time.time() + predictions = session.run(None, ort_inputs) + toc = time.time() + if idx >= num_warmup: + total_time += toc - tic + + print("\n", "-" * 10, "Summary:", "-" * 10) + print(args) + throughput = (num_iter - num_warmup) / total_time + print("Throughput: {} samples/s".format(throughput)) + elif args.mode == "accuracy": + acc_result = eval_func(args.model_path) + print("Batch size = %d" % args.batch_size) + print("Accuracy: %.5f" % acc_result) + + if args.tune: + # optimize model + with tempfile.TemporaryDirectory(prefix="ort.opt.") as tmp_dir: + opt_options = FusionOptions("bert") + opt_options.enable_embed_layer_norm = False + + model_optimizer = optimizer.optimize_model( + args.model_path, "bert", num_heads=12, hidden_size=768, optimization_options=opt_options + ) + model = model_optimizer.model + + # check the optimized model is valid + try: + onnxruntime.InferenceSession(model.SerializeToString(), providers=onnxruntime.get_available_providers()) + onnx.save(model, pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix()) + model = pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix() + except Exception as e: + logger.warning("Optimized model is invalid: {}. ".format(e)) + logger.warning("Model optimizer will be skipped. 
" "Try to upgrade onnxruntime to avoid this error") + model = args.model_path + + custom_tune_config = tuning.TuningConfig(config_set=config.DynamicQuantConfig.get_config_set_for_tuning()) + best_model = tuning.autotune( + model_input=model, + tune_config=custom_tune_config, + eval_fn=eval_func, + optimization_level=onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL, + ) + onnx.save(best_model, args.output_model) diff --git a/examples/nlp/bert/quantization/ptq_dynamic/prepare_data.sh b/examples/nlp/bert/quantization/ptq_dynamic/prepare_data.sh new file mode 100644 index 000000000..c1fddb546 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_dynamic/prepare_data.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + download_data + +} + +# init params +function init_params { + + for var in "$@" + do + case $var in + --data_dir=*) + data_dir=$(echo "$var" |cut -f2 -d=) + ;; + --task_name=*) + task_name=$(echo "$var" |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function download_data { + wget https://raw.githubusercontent.com/huggingface/transformers/f98ef14d161d7bcdc9808b5ec399981481411cc1/utils/download_glue_data.py + python download_glue_data.py --data_dir="${data_dir}" --tasks="${task_name}" +} + +main "$@" + diff --git a/examples/nlp/bert/quantization/ptq_dynamic/prepare_model.py b/examples/nlp/bert/quantization/ptq_dynamic/prepare_model.py new file mode 100644 index 000000000..5b9216640 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_dynamic/prepare_model.py @@ -0,0 +1,118 @@ +import argparse +import os +import sys +import urllib +import zipfile + +import torch +import transformers + +# Please refer to [Bert-GLUE_OnnxRuntime_quantization guide] +# (https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/notebooks/bert/Bert-GLUE_OnnxRuntime_quantization.ipynb) +# for detailed model export. + +MODEL_URL = "https://download.pytorch.org/tutorial/MRPC.zip" +MAX_TIMES_RETRY_DOWNLOAD = 5 + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("--input_model", type=str, required=False, default="MRPC.zip") + parser.add_argument("--output_model", type=str, required=True) + parser.add_argument("--max_len", type=int, default=128, help="Maximum length of the sentence pairs") + return parser.parse_args() + + +def progressbar(cur, total=100): + percent = "{:.2%}".format(cur / total) + sys.stdout.write("\r[%-100s] %s" % ("#" * int(cur), percent)) + sys.stdout.flush() + + +def schedule(blocknum, blocksize, totalsize): + if totalsize == 0: + percent = 0 + else: + percent = min(1.0, blocknum * blocksize / totalsize) * 100 + progressbar(percent) + + +def is_zip_file(filename): + try: + with open(filename, "rb") as f: + magic_number = f.read(4) + return magic_number == b"PK\x03\x04" # ZIP file magic number + except OSError: + return False + + +def extrafile(filename, target_folder="."): + with zipfile.ZipFile(filename, "r") as zin: + zin.extractall(target_folder) + + +def download_model(url, model_name, retry_times=5): + if os.path.isdir(model_name): + return model_name + elif os.path.exists(model_name) and is_zip_file(model_name): + print("file downloaded") + extrafile(model_name) + return True + + print("download model...") + retries = 0 + while retries < retry_times: + try: + urllib.request.urlretrieve(url, model_name, schedule) + extrafile(model_name) + break + except KeyboardInterrupt: + return False + except: + retries += 1 + print(f"Download failed{', Retry downloading...' 
if retries < retry_times else '!'}") + return retries < retry_times + + +def export_model(model, output_model, max_len=128): + with torch.no_grad(): + inputs = { + "input_ids": torch.ones(1, max_len, dtype=torch.int64), + "attention_mask": torch.ones(1, max_len, dtype=torch.int64), + "token_type_ids": torch.ones(1, max_len, dtype=torch.int64), + } + + symbolic_names = {0: "batch_size", 1: "max_seq_len"} + torch.onnx.export( + model, # model being run + ( + inputs["input_ids"], + inputs["attention_mask"], + inputs["token_type_ids"], + ), # model input (or a tuple for multiple inputs) + output_model, # where to save the model (can be a file or file-like object) + opset_version=14, # the ONNX version to export the model + do_constant_folding=True, # whether to execute constant folding + input_names=["input_ids", "input_mask", "segment_ids"], # the model's input names + output_names=["output"], # the model's output names + dynamic_axes={ + "input_ids": symbolic_names, # variable length axes + "input_mask": symbolic_names, + "segment_ids": symbolic_names, + }, + ) + assert os.path.exists(output_model), f"Export failed! {output_model} doesn't exist!" + print("ONNX Model exported to {0}".format(output_model)) + + +def prepare_model(input_model, output_model, max_len): + is_download_successful = download_model(MODEL_URL, input_model, MAX_TIMES_RETRY_DOWNLOAD) + if is_download_successful: + folder_name = is_download_successful if isinstance(is_download_successful, str) else "./MRPC" + model = transformers.BertForSequenceClassification.from_pretrained(folder_name) + export_model(model, output_model, max_len) + + +if __name__ == "__main__": + args = parse_arguments() + prepare_model(args.input_model, args.output_model, args.max_len) diff --git a/examples/nlp/bert/quantization/ptq_dynamic/requirements.txt b/examples/nlp/bert/quantization/ptq_dynamic/requirements.txt new file mode 100644 index 000000000..85dc725a4 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_dynamic/requirements.txt @@ -0,0 +1,8 @@ +torch +transformers +accelerate +onnx +onnxruntime +coloredlogs +sympy +onnxruntime-extensions diff --git a/examples/nlp/bert/quantization/ptq_dynamic/run_benchmark.sh b/examples/nlp/bert/quantization/ptq_dynamic/run_benchmark.sh new file mode 100644 index 000000000..b92ae1ce1 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_dynamic/run_benchmark.sh @@ -0,0 +1,64 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo "$var" |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo "$var" |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo "$var" |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo "$var" |cut -f2 -d=) + ;; + --intra_op_num_threads=*) + intra_op_num_threads=$(echo "$var" |cut -f2 -d=) + ;; + esac + done + +} + +# run_benchmark +function run_benchmark { + if [[ ${mode} == "accuracy" ]]; then + dynamic_length=False + elif [[ ${mode} == "performance" ]]; then + dynamic_length=True + else + echo "Error: No such mode: ${mode}" + exit 1 + fi + + model_name_or_path="bert-base-uncased" + task_name="mrpc" + + python main.py \ + --model_path "${input_model}" \ + --model_name_or_path "${model_name_or_path}" \ + --data_path "${dataset_location}" \ + --task "${task_name}" \ + --batch_size "${batch_size}" \ + --mode "${mode}" \ + --dynamic_length "${dynamic_length}" \ + --intra_op_num_threads "${intra_op_num_threads-4}" \ + --benchmark 
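The BERT example above also funnels quantization through `tuning.autotune`. Stripped of the GLUE data loading and the onnxruntime.transformers optimization step, the dynamic-quantization call in `main.py` reduces to the sketch below; the model path and the constant-return `eval_fn` are illustrative placeholders, while the API calls are the ones used in the example itself.

```python
# A condensed sketch of the dynamic-quantization tuning call from the BERT main.py above.
# "bert.onnx" and the constant-return eval_fn are illustrative placeholders.
import onnx
import onnxruntime

from onnx_neural_compressor.quantization import config, tuning


def eval_fn(model):
    # Placeholder: in the example this is the GLUE metric computed by ONNXRTGLUE.
    return 1.0


tune_config = tuning.TuningConfig(config_set=config.DynamicQuantConfig.get_config_set_for_tuning())
best_model = tuning.autotune(
    model_input="bert.onnx",  # e.g. the model exported by prepare_model.py
    tune_config=tune_config,
    eval_fn=eval_fn,
    # main.py pre-optimizes the model with onnxruntime.transformers, then disables
    # further onnxruntime graph optimization during quantization.
    optimization_level=onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL,
)
onnx.save(best_model, "bert-int8-dynamic.onnx")
```

Unlike the static ResNet50 example, no calibration data reader is needed here, since dynamic quantization computes activation scales at runtime.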
+ +} + +main "$@" + diff --git a/examples/nlp/bert/quantization/ptq_dynamic/run_quant.sh b/examples/nlp/bert/quantization/ptq_dynamic/run_quant.sh new file mode 100644 index 000000000..53e864930 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_dynamic/run_quant.sh @@ -0,0 +1,47 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + run_tuning +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo "$var" |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo "$var" |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo "$var" |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_tuning { + model_name_or_path="bert-base-uncased" + batch_size=8 + task_name="mrpc" + + python main.py \ + --model_path "${input_model}" \ + --output_model "${output_model}" \ + --model_name_or_path "${model_name_or_path}" \ + --data_path "${dataset_location}" \ + --task "${task_name}" \ + --batch_size "${batch_size}" \ + --tune +} + +main "$@" + + + diff --git a/examples/nlp/bert/quantization/ptq_static/README.md b/examples/nlp/bert/quantization/ptq_static/README.md new file mode 100644 index 000000000..c34e76a79 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_static/README.md @@ -0,0 +1,60 @@ +Step-by-Step +============ + +This example load a BERT model and confirm its accuracy and speed based on [GLUE data](https://gluebenchmark.com/). + +# Prerequisite + +## 1. Environment + +```shell +pip install onnx-neural-compressor +pip install -r requirements.txt +``` + +## 2. Prepare Dataset + +download the GLUE data with `prepare_data.sh` script. +```shell +export GLUE_DIR=path/to/glue_data +export TASK_NAME=MRPC + +bash prepare_data.sh --data_dir=$GLUE_DIR --task_name=$TASK_NAME +``` + +## 3. Prepare Model + +```shell +python prepare_model.py --input_model='MRPC.zip' --output_model='bert.onnx' +``` + +# Run + +## 1. Quantization + +Static quantization with QOperator format: + +```bash +bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx + --output_model=path/to/model_tune \ + --dataset_location=path/to/glue_data \ + --quant_format="QOperator" +``` + +Static quantization with QDQ format: + +```bash +bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx + --output_model=path/to/model_tune \ # model path as *.onnx + --dataset_location=path/to/glue_data \ + --quant_format="QDQ" +``` + +## 2. Benchmark + +```bash +bash run_benchmark.sh --input_model=path/to/model \ # model path as *.onnx + --dataset_location=path/to/glue_data \ + --batch_size=batch_size \ + --mode=performance # or accuracy +``` diff --git a/examples/nlp/bert/quantization/ptq_static/main.py b/examples/nlp/bert/quantization/ptq_static/main.py new file mode 100644 index 000000000..bfaf55504 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_static/main.py @@ -0,0 +1,509 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint:disable=redefined-outer-name,logging-format-interpolation + +import argparse +import dataclasses +import logging +import os +import pathlib +import tempfile +import time +from typing import List, Optional, Union + +import numpy as np +import onnx +import onnxruntime +import torch +import transformers +from onnxruntime.transformers import optimizer +from onnxruntime.transformers.fusion_options import FusionOptions +from torch.utils import data + +from onnx_neural_compressor import data_reader, quantization +from onnx_neural_compressor.quantization import config, tuning + +logger = logging.getLogger(__name__) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN +) +logger.info("Evaluating ONNXRuntime full precision accuracy and performance:") +parser = argparse.ArgumentParser( + description="BERT fine-tune examples for classification/regression tasks.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, +) +parser.add_argument("--model_path", type=str, help="Pre-trained model on onnx file") +parser.add_argument("--benchmark", action="store_true", default=False) +parser.add_argument("--tune", action="store_true", default=False, help="whether quantize the model") +parser.add_argument("--output_model", type=str, help="output model path") +parser.add_argument("--mode", type=str, help="benchmark mode of performance or accuracy") +parser.add_argument("--model_name_or_path", type=str, help="pretrained model name or path") +parser.add_argument("--data_path", type=str, help="input data path") +parser.add_argument( + "--batch_size", + default=8, + type=int, +) +parser.add_argument( + "--task", + type=str, + default="mrpc", + choices=["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", "mnli", "wnli", "sst-2"], + help="GLUE task name", +) +parser.add_argument( + "--quant_format", type=str, default="QOperator", choices=["QDQ", "QOperator"], help="quantization format" +) +parser.add_argument( + "--intra_op_num_threads", type=int, default=4, help="intra_op_num_threads for performance benchmark" +) +parser.add_argument("--dynamic_length", type=bool, default=False, help="dynamic length") +parser.add_argument("--max_seq_length", type=int, default=128, help="max sequence length") +parser.add_argument( + "--model_type", type=str, default="bert", choices=["distilbert", "bert", "mobilebert", "roberta"], help="model type" +) +parser.add_argument( + "--device", + type=str, + default="cpu", + choices=["cpu", "npu"], +) +args = parser.parse_args() + + +class ONNXRTBertDataset: + """Dataset used for model Bert. + Args: data_dir (str): The input data dir. + model_name_or_path (str): Path to pre-trained student model or shortcut name, + selected in the list: + max_seq_length (int, default=128): The maximum length after tokenization. + Sequences longer than this will be truncated, + sequences shorter will be padded. + do_lower_case (bool, default=True): Whether to lowercase the input when tokenizing. + task (str, default=mrpc): The name of the task to fine-tune. 
+ Choices include mrpc, qqp, qnli, rte, + sts-b, cola, mnli, wnli. + model_type (str, default="bert"): model type, support "distilbert", "bert", + "mobilebert", "roberta". + dynamic_length (bool, default=False): Whether to use fixed sequence length. + evaluate (bool, default=True): Whether do evaluation or training. + transform (transform object, default=None): transform to process input data. + filter (Filter objects, default=None): filter out examples according + to specific conditions. + """ + + def __init__( + self, + model, + data_dir, + model_name_or_path, + max_seq_length=128, + do_lower_case=True, + task="mrpc", + model_type="bert", + dynamic_length=False, + evaluate=True, + transform=None, + filter=None, + ): + self.inputs = [inp.name for inp in onnx.load(model).graph.input] + task = task.lower() + model_type = model_type.lower() + assert task in ["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", "mnli", "wnli", "sst-2"], "Unsupported task type" + assert model_type in [ + "distilbert", + "bert", + "mobilebert", + "roberta", + ], "Unsupported \ + model type" + self.dynamic_length = dynamic_length + self.model_type = model_type + self.max_seq_length = max_seq_length + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) + self.dataset = load_and_cache_examples( + data_dir, model_name_or_path, max_seq_length, task, model_type, tokenizer, evaluate + ) + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, index): + batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in self.dataset[index]) + return batch[: len(self.inputs)], batch[-1] + + +def load_and_cache_examples(data_dir, model_name_or_path, max_seq_length, task, model_type, tokenizer, evaluate): + processor = transformers.glue_processors[task]() + output_mode = transformers.glue_output_modes[task] + # Load data features from cache or dataset file + if not os.path.exists("./dataset_cached"): + os.makedirs("./dataset_cached") + cached_features_file = os.path.join( + "./dataset_cached", + "cached_{}_{}_{}_{}".format( + "dev" if evaluate else "train", + list(filter(None, model_name_or_path.split("/"))).pop(), + str(max_seq_length), + str(task), + ), + ) + if os.path.exists(cached_features_file): + logger.info("Load features from cached file {}.".format(cached_features_file)) + features = torch.load(cached_features_file) + else: + logger.info("Create features from dataset file at {}.".format(data_dir)) + label_list = processor.get_labels() + examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) + features = convert_examples_to_features( + examples, + tokenizer, + task=task, + label_list=label_list, + max_length=max_seq_length, + output_mode=output_mode, + ) + logger.info("Save features into cached file {}.".format(cached_features_file)) + torch.save(features, cached_features_file) + # Convert to Tensors and build dataset + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) + all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long) + if output_mode == "classification": + all_labels = torch.tensor([f.label for f in features], dtype=torch.long) + elif output_mode == "regression": + all_labels = torch.tensor([f.label for f in features], 
dtype=torch.float) + dataset = data.TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_seq_lengths, all_labels) + return dataset + + +def convert_examples_to_features( + examples, + tokenizer, + max_length=128, + task=None, + label_list=None, + output_mode="classification", + pad_token=0, + pad_token_segment_id=0, + mask_padding_with_zero=True, +): + processor = transformers.glue_processors[task]() + if label_list is None: + label_list = processor.get_labels() + logger.info("Use label list {} for task {}.".format(label_list, task)) + label_map = {label: i for i, label in enumerate(label_list)} + features = [] + for ex_index, example in enumerate(examples): + inputs = tokenizer.encode_plus( + example.text_a, + example.text_b, + add_special_tokens=True, + max_length=max_length, + return_token_type_ids=True, + truncation=True, + ) + input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. + seq_length = len(input_ids) + padding_length = max_length - len(input_ids) + + input_ids = input_ids + ([pad_token] * padding_length) + attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) + + assert len(input_ids) == max_length, "Error with input_ids length {} vs {}".format(len(input_ids), max_length) + assert len(attention_mask) == max_length, "Error with attention_mask length {} vs {}".format( + len(attention_mask), max_length + ) + assert len(token_type_ids) == max_length, "Error with token_type_ids length {} vs {}".format( + len(token_type_ids), max_length + ) + if output_mode == "classification": + label = label_map[example.label] + elif output_mode == "regression": + label = float(example.label) + else: + raise KeyError(output_mode) + + feats = InputFeatures( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + label=label, + seq_length=seq_length, + ) + features.append(feats) + return features + + +@dataclasses.dataclass(frozen=True) +class InputFeatures: + """ + A single set of features of data. + Property names are the same names as the corresponding inputs to a model. + Args: + input_ids: Indices of input sequence tokens in the vocabulary. + attention_mask: Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED, + ``0`` for MASKED (padded) tokens. + token_type_ids: (Optional) Segment token indices to indicate first and second + portions of the inputs. Only some models use them. + label: (Optional) Label corresponding to the input. Int for classification problems, + float for regression problems. + seq_length: (Optional) The length of input sequence before padding. + """ + + input_ids: List[int] + attention_mask: Optional[List[int]] = None + token_type_ids: Optional[List[int]] = None + label: Optional[Union[int, float]] = None + seq_length: Optional[List[int]] = None + + +class ONNXRTGLUE: + """Computes GLUE score. + + Args: + task (str, default=mrpc): The name of the task. + Choices include mrpc, qqp, qnli, rte, + sts-b, cola, mnli, wnli. 
+ + """ + + def __init__(self, task="mrpc"): + assert task in ["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", "mnli", "wnli", "sst-2"], "Unsupported task type" + self.pred_list = None + self.label_list = None + self.task = task + self.return_key = { + "cola": "mcc", + "mrpc": "acc", + "sts-b": "corr", + "qqp": "acc", + "mnli": "mnli/acc", + "qnli": "acc", + "rte": "acc", + "wnli": "acc", + "sst-2": "acc", + } + + def update(self, preds, labels): + """add preds and labels to storage""" + if isinstance(preds, list) and len(preds) == 1: + preds = preds[0] + if isinstance(labels, list) and len(labels) == 1: + labels = labels[0] + if self.pred_list is None: + self.pred_list = preds + self.label_list = labels + else: + self.pred_list = np.append(self.pred_list, preds, axis=0) + self.label_list = np.append(self.label_list, labels, axis=0) + + def reset(self): + """clear preds and labels storage""" + self.pred_list = None + self.label_list = None + + def result(self): + """calculate metric""" + output_mode = transformers.glue_output_modes[self.task] + if output_mode == "classification": + processed_preds = np.argmax(self.pred_list, axis=1) + elif output_mode == "regression": + processed_preds = np.squeeze(self.pred_list) + result = transformers.glue_compute_metrics(self.task, processed_preds, self.label_list) + return result[self.return_key[self.task]] + + +class DataReader(data_reader.CalibrationDataReader): + def __init__(self, model_path, dynamic_length=False, batch_size=1, calibration_sampling_size=8): + self.encoded_list = [] + self.batch_size = batch_size + dataset = ONNXRTBertDataset( + args.model_path, + data_dir=args.data_path, + model_name_or_path=args.model_name_or_path, + max_seq_length=args.max_seq_length, + task=args.task, + model_type=args.model_type, + dynamic_length=args.dynamic_length, + ) + dataloader = data.DataLoader( + dataset, + sampler=data.SequentialSampler(dataset), + batch_size=self.batch_size, + shuffle=False, + ) + model = onnx.load(model_path, load_external_data=False) + inputs_names = [input.name for input in model.graph.input] + self.batch_size = batch_size + + for idx, batch in enumerate(dataloader): + if idx + 1 > calibration_sampling_size: + break + ort_input = {} + batch_seq_length = args.max_seq_length if not args.dynamic_length else torch.max(batch[0][-2], 0)[0].item() + batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in batch[0]) + + for name, inputs in zip(inputs_names, batch): + ort_input[name] = inputs[:, :batch_seq_length] + + self.encoded_list.append(ort_input) + + self.iter_next = iter(self.encoded_list) + + def get_next(self): + return next(self.iter_next, None) + + def rewind(self): + self.iter_next = iter(self.encoded_list) + + +if __name__ == "__main__": + # set config for npu test + provider = "DmlExecutionProvider" if args.device == "npu" else "CPUExecutionProvider" + + dataset = ONNXRTBertDataset( + args.model_path, + data_dir=args.data_path, + model_name_or_path=args.model_name_or_path, + max_seq_length=args.max_seq_length, + task=args.task, + model_type=args.model_type, + dynamic_length=args.dynamic_length, + ) + dataloader = data.DataLoader( + dataset, + sampler=data.SequentialSampler(dataset), + batch_size=args.batch_size, + shuffle=False, + ) + + def eval_func(model): + metric = ONNXRTGLUE(args.task) + session = onnxruntime.InferenceSession(model, providers=[provider]) + ort_inputs = {} + len_inputs = len(session.get_inputs()) + inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] + + 
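# evaluation loop: each batch is trimmed to its effective sequence length,
+ # fed to the session by matching input position, and the resulting logits and
+ # labels are accumulated into the GLUE metric +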
for idx, batch in enumerate(dataloader): + label = batch[-1] + batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in batch[0]) + batch_seq_length = args.max_seq_length if not args.dynamic_length else batch[0].shape[-1] + inputs = [ + batch[0][:, :batch_seq_length], + batch[1][:, :batch_seq_length], + batch[2][:, :batch_seq_length], + ] + for i in range(len_inputs): + ort_inputs.update({inputs_names[i]: inputs[i]}) + predictions = session.run(None, ort_inputs) + metric.update(predictions[0], label) + return metric.result() + + if args.benchmark: + if args.mode == "performance": + total_time = 0.0 + num_iter = 100 + num_warmup = 10 + + sess_options = onnxruntime.SessionOptions() + sess_options.intra_op_num_threads = args.intra_op_num_threads + session = onnxruntime.InferenceSession( + args.model_path, sess_options, providers=onnxruntime.get_available_providers() + ) + ort_inputs = {} + len_inputs = len(session.get_inputs()) + inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] + + for idx, batch in enumerate(dataloader): + if idx + 1 > num_iter: + break + batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in batch[0]) + batch_seq_length = args.max_seq_length if not args.dynamic_length else batch[0].shape[-1] + inputs = [ + batch[0][:, :batch_seq_length], + batch[1][:, :batch_seq_length], + batch[2][:, :batch_seq_length], + ] + for i in range(len_inputs): + ort_inputs.update({inputs_names[i]: inputs[i]}) + tic = time.time() + predictions = session.run(None, ort_inputs) + toc = time.time() + if idx >= num_warmup: + total_time += toc - tic + + print("\n", "-" * 10, "Summary:", "-" * 10) + print(args) + throughput = (num_iter - num_warmup) / total_time + print("Throughput: {} samples/s".format(throughput)) + elif args.mode == "accuracy": + acc_result = eval_func(args.model_path) + print("Batch size = %d" % args.batch_size) + print("Accuracy: %.5f" % acc_result) + + if args.tune: + # optimize model + with tempfile.TemporaryDirectory(prefix="ort.opt.") as tmp_dir: + opt_options = FusionOptions("bert") + opt_options.enable_embed_layer_norm = False + + model_optimizer = optimizer.optimize_model( + args.model_path, "bert", num_heads=12, hidden_size=768, optimization_options=opt_options + ) + model = model_optimizer.model + + # check the optimized model is valid + try: + onnxruntime.InferenceSession(model.SerializeToString(), providers=onnxruntime.get_available_providers()) + onnx.save(model, pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix()) + model = pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix() + except Exception as e: + logger.warning("Optimized model is invalid: {}. ".format(e)) + logger.warning("Model optimizer will be skipped. 
" "Try to upgrade onnxruntime to avoid this error") + model = args.model_path + + calibration_data_reader = DataReader(args.model_path, calibration_sampling_size=8) + custom_tune_config = tuning.TuningConfig( + config_set=config.StaticQuantConfig.get_config_set_for_tuning( + quant_format=( + quantization.QuantFormat.QOperator + if args.quant_format == "QOperator" + else quantization.QuantFormat.QDQ + ), + calibration_sampling_size=8, + op_types_to_quantize=["MatMul"], + extra_options={"OpTypesToExcludeOutputQuantization": ["MatMul"]}, + execution_provider=provider, + ) + ) + best_model = tuning.autotune( + model_input=model, + tune_config=custom_tune_config, + eval_fn=eval_func, + calibration_data_reader=calibration_data_reader, + optimization_level=onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL, + ) + onnx.save(best_model, args.output_model) diff --git a/examples/nlp/bert/quantization/ptq_static/prepare_data.sh b/examples/nlp/bert/quantization/ptq_static/prepare_data.sh new file mode 100644 index 000000000..c1fddb546 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_static/prepare_data.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + download_data + +} + +# init params +function init_params { + + for var in "$@" + do + case $var in + --data_dir=*) + data_dir=$(echo "$var" |cut -f2 -d=) + ;; + --task_name=*) + task_name=$(echo "$var" |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function download_data { + wget https://raw.githubusercontent.com/huggingface/transformers/f98ef14d161d7bcdc9808b5ec399981481411cc1/utils/download_glue_data.py + python download_glue_data.py --data_dir="${data_dir}" --tasks="${task_name}" +} + +main "$@" + diff --git a/examples/nlp/bert/quantization/ptq_static/prepare_model.py b/examples/nlp/bert/quantization/ptq_static/prepare_model.py new file mode 100644 index 000000000..5b9216640 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_static/prepare_model.py @@ -0,0 +1,118 @@ +import argparse +import os +import sys +import urllib +import zipfile + +import torch +import transformers + +# Please refer to [Bert-GLUE_OnnxRuntime_quantization guide] +# (https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/notebooks/bert/Bert-GLUE_OnnxRuntime_quantization.ipynb) +# for detailed model export. 
+ +MODEL_URL = "https://download.pytorch.org/tutorial/MRPC.zip" +MAX_TIMES_RETRY_DOWNLOAD = 5 + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("--input_model", type=str, required=False, default="MRPC.zip") + parser.add_argument("--output_model", type=str, required=True) + parser.add_argument("--max_len", type=int, default=128, help="Maximum length of the sentence pairs") + return parser.parse_args() + + +def progressbar(cur, total=100): + percent = "{:.2%}".format(cur / total) + sys.stdout.write("\r[%-100s] %s" % ("#" * int(cur), percent)) + sys.stdout.flush() + + +def schedule(blocknum, blocksize, totalsize): + if totalsize == 0: + percent = 0 + else: + percent = min(1.0, blocknum * blocksize / totalsize) * 100 + progressbar(percent) + + +def is_zip_file(filename): + try: + with open(filename, "rb") as f: + magic_number = f.read(4) + return magic_number == b"PK\x03\x04" # ZIP file magic number + except OSError: + return False + + +def extrafile(filename, target_folder="."): + with zipfile.ZipFile(filename, "r") as zin: + zin.extractall(target_folder) + + +def download_model(url, model_name, retry_times=5): + if os.path.isdir(model_name): + return model_name + elif os.path.exists(model_name) and is_zip_file(model_name): + print("file downloaded") + extrafile(model_name) + return True + + print("download model...") + retries = 0 + while retries < retry_times: + try: + urllib.request.urlretrieve(url, model_name, schedule) + extrafile(model_name) + break + except KeyboardInterrupt: + return False + except: + retries += 1 + print(f"Download failed{', Retry downloading...' if retries < retry_times else '!'}") + return retries < retry_times + + +def export_model(model, output_model, max_len=128): + with torch.no_grad(): + inputs = { + "input_ids": torch.ones(1, max_len, dtype=torch.int64), + "attention_mask": torch.ones(1, max_len, dtype=torch.int64), + "token_type_ids": torch.ones(1, max_len, dtype=torch.int64), + } + + symbolic_names = {0: "batch_size", 1: "max_seq_len"} + torch.onnx.export( + model, # model being run + ( + inputs["input_ids"], + inputs["attention_mask"], + inputs["token_type_ids"], + ), # model input (or a tuple for multiple inputs) + output_model, # where to save the model (can be a file or file-like object) + opset_version=14, # the ONNX version to export the model + do_constant_folding=True, # whether to execute constant folding + input_names=["input_ids", "input_mask", "segment_ids"], # the model's input names + output_names=["output"], # the model's output names + dynamic_axes={ + "input_ids": symbolic_names, # variable length axes + "input_mask": symbolic_names, + "segment_ids": symbolic_names, + }, + ) + assert os.path.exists(output_model), f"Export failed! {output_model} doesn't exist!" 
+ print("ONNX Model exported to {0}".format(output_model)) + + +def prepare_model(input_model, output_model, max_len): + is_download_successful = download_model(MODEL_URL, input_model, MAX_TIMES_RETRY_DOWNLOAD) + if is_download_successful: + folder_name = is_download_successful if isinstance(is_download_successful, str) else "./MRPC" + model = transformers.BertForSequenceClassification.from_pretrained(folder_name) + export_model(model, output_model, max_len) + + +if __name__ == "__main__": + args = parse_arguments() + prepare_model(args.input_model, args.output_model, args.max_len) diff --git a/examples/nlp/bert/quantization/ptq_static/requirements.txt b/examples/nlp/bert/quantization/ptq_static/requirements.txt new file mode 100644 index 000000000..85dc725a4 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_static/requirements.txt @@ -0,0 +1,8 @@ +torch +transformers +accelerate +onnx +onnxruntime +coloredlogs +sympy +onnxruntime-extensions diff --git a/examples/nlp/bert/quantization/ptq_static/run_benchmark.sh b/examples/nlp/bert/quantization/ptq_static/run_benchmark.sh new file mode 100644 index 000000000..465524f04 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_static/run_benchmark.sh @@ -0,0 +1,64 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo "$var" |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo "$var" |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo "$var" |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo "$var" |cut -f2 -d=) + ;; + --intra_op_num_threads=*) + intra_op_num_threads=$(echo "$var" |cut -f2 -d=) + ;; + esac + done + +} + +# run_benchmark +function run_benchmark { + if [[ ${mode} == "accuracy" ]]; then + dynamic_length=False + elif [[ ${mode} == "performance" ]]; then + dynamic_length=True + else + echo "Error: No such mode: ${mode}" + exit 1 + fi + + model_name_or_path="bert-base-uncased" + task_name="mrpc" + + python main.py \ + --model_path "${input_model}" \ + --model_name_or_path "${model_name_or_path}" \ + --data_path "${dataset_location}" \ + --task "${task_name}" \ + --batch_size "${batch_size}" \ + --mode "${mode}" \ + --intra_op_num_threads "${intra_op_num_threads-4}" \ + --dynamic_length "${dynamic_length}" \ + --benchmark + +} + +main "$@" + diff --git a/examples/nlp/bert/quantization/ptq_static/run_quant.sh b/examples/nlp/bert/quantization/ptq_static/run_quant.sh new file mode 100644 index 000000000..976e8e0c2 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_static/run_quant.sh @@ -0,0 +1,53 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + run_tuning +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo "$var" |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo "$var" |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo "$var" |cut -f2 -d=) + ;; + --quant_format=*) + quant_format=$(echo "$var" |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_tuning { + model_name_or_path="bert-base-uncased" + batch_size=8 + task_name="mrpc" + model_type="bert" + + python main.py \ + --model_path "${input_model}" \ + --output_model "${output_model}" \ + --model_name_or_path "${model_name_or_path}" \ + --data_path "${dataset_location}" \ + --task "${task_name}" \ + --batch_size "${batch_size}" \ + --model_type "${model_type}" \ + --quant_format 
"${quant_format}" \ + --tune +} + +main "$@" + + + diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py index 9cafe62d3..572e1f010 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py @@ -33,8 +33,8 @@ from torch.nn import functional from torch.utils import data -from onnx_neural_compressor import config, data_reader, logger, utility -from onnx_neural_compressor.quantization import matmul_nbits_quantizer, tuning +from onnx_neural_compressor import data_reader +from onnx_neural_compressor.quantization import config, matmul_nbits_quantizer, tuning logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN @@ -315,10 +315,6 @@ def rewind(self): if __name__ == "__main__": - utility.set_workspace(args.workspace) - if not os.path.exists(args.workspace): - os.mkdir(args.workspace) - if args.benchmark: if args.mode == "performance": benchmark(args.model_path) @@ -331,23 +327,11 @@ def rewind(self): model_name = "model.onnx" # require optimum >= 1.14.0 model_path = os.path.join(args.model_path, model_name) - # do graph optimization - logger.info("Start graph optimization...") - sess_options = ort.SessionOptions() - sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED - sess_options.optimized_model_filepath = os.path.join(args.workspace, "Optimized_model.onnx") - sess_options.add_session_config_entry( - "session.optimized_model_external_initializers_file_name", "Optimized_model.onnx_data" - ) - sess_options.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "1024") - sess = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"]) - logger.info("Graph optimization done.") - best_model = None if args.algorithm.upper() == "RTN": algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig() quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( - sess_options.optimized_model_filepath, + model_path, n_bits=4, block_size=32, is_symmetric=True, @@ -362,7 +346,7 @@ def rewind(self): calibration_data_reader=calibration_data_reader, enable_mse_search=False ) quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( - sess_options.optimized_model_filepath, + model_path, n_bits=4, block_size=32, is_symmetric=True, @@ -377,7 +361,7 @@ def rewind(self): calibration_data_reader=calibration_data_reader, ) quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( - sess_options.optimized_model_filepath, + model_path, n_bits=4, block_size=32, is_symmetric=False, diff --git a/onnx_neural_compressor/__init__.py b/onnx_neural_compressor/__init__.py index a8e492104..2175e2eba 100644 --- a/onnx_neural_compressor/__init__.py +++ b/onnx_neural_compressor/__init__.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -# # Copyright (c) 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/onnx_neural_compressor/algorithms/layer_wise/core.py b/onnx_neural_compressor/algorithms/layer_wise/core.py index 2e381cfdb..b4a665f8c 100644 --- a/onnx_neural_compressor/algorithms/layer_wise/core.py +++ b/onnx_neural_compressor/algorithms/layer_wise/core.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -# # Copyright (c) 2023 MIT HAN Lab # This source code is licensed under 
the MIT license # @@ -24,7 +22,7 @@ import onnx import onnxruntime as ort -from onnx_neural_compressor import data_reader, logger, onnx_model, utility +from onnx_neural_compressor import data_reader, logger, onnx_model from typing import Callable, List, Union # isort: skip @@ -49,7 +47,7 @@ def layer_wise_quant( _type_: _description_ """ # check whether model shape is inferred - if not utility.check_model_with_infer_shapes(model): + if not _check_model_with_infer_shapes(model): logger.error( "Before applying layer-wise quantization, please make sure to " "run symbolic shape inference on your model like follows:\n" @@ -277,3 +275,14 @@ def _prepare_data_reader_for_next_split_model( inputs.update({name: value for name, value in zip(output_names, out)}) data_reader_for_next_split_model.append(inputs) return DataReader(data_reader_for_next_split_model) + + +def _check_model_with_infer_shapes(model): + """Check if the model has been shape inferred.""" + if isinstance(model, (pathlib.Path, str)): + model = onnx.load(model, load_external_data=False) + elif isinstance(model, onnx_model.ONNXModel): + model = model.model + if len(model.graph.value_info) > 0: + return True + return False diff --git a/onnx_neural_compressor/algorithms/post_training_quant/__init__.py b/onnx_neural_compressor/algorithms/post_training_quant/__init__.py new file mode 100644 index 000000000..28f108cb6 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py b/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py new file mode 100644 index 000000000..095897b49 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py @@ -0,0 +1,637 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft, Intel Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
+# -------------------------------------------------------------------------- +"""Calibration for onnx models.""" + +import copy +import logging +import os +import sys +from importlib import util + +import numpy as np +import onnx +import onnxruntime +from packaging import version + +from onnx_neural_compressor import logger, onnx_model +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant import calibrator + +if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): + import onnxruntime_extensions + +ONNX18_VERSION = version.Version("1.8.0") +ORT112_VERSION = version.Version("1.12.0") + + +class ONNXRTAugment: + """Augment input model to dump tensor or for calibration.""" + + def __init__( + self, + model_wrapper, + dataloader, + dump_op_types, + black_nodes=[], + white_nodes=[], + iterations=[], + execution_provider="CPUExecutionProvider", + reduce_range=False, + **kwargs, + ): + """Initialization. + + Args: + model_wrapper (Model): model to be augmented + dataloader (object): user implemented object to read in and preprocess calibration dataset + dump_op_types (list): operator types to be calibrated and quantized + black_nodes (list, optional): operator names that should not be quantized. Defaults to []. + white_nodes (list, optional): operator names that force to be quantized. Defaults to []. + iterations (list, optional): tensor of which iteration will be collected. Defaults to []. + execution_provider (list, optional): execution provider for onnxruntime. Defaults to 'CPUExecutionProvider'. + reduce_range (bool, optional): use 7 bit or not. Defaults to False. + """ + self.model_wrapper = ( + model_wrapper + if isinstance(model_wrapper, onnx_model.ONNXModel) + else onnx_model.ONNXModel(model_wrapper, load_external_data=True) + ) + self.model = self.model_wrapper.model + ai_onnx_domain = [opset for opset in self.model.opset_import if not opset.domain or opset.domain == "ai.onnx"] + self.opset_version = ai_onnx_domain[0].version + self.dataloader = dataloader + self.dump_op_types = dump_op_types + self.black_nodes = black_nodes + self.white_nodes = white_nodes + self.augmented_model = None + self.iterations = iterations + self.execution_provider = execution_provider + self.augment_nodes = [] + self.dequantized_output = {} + self.already_quantized = "DequantizeLinear" in [node.op_type for node in self.model.graph.node] + self.dynamically_quantized = False + self.ort_version = version.Version(onnxruntime.__version__) + self.reduce_range = reduce_range + + def augment_graph(self): + """Augment_graph. + + Adds nodes to all quantization_candidates op type nodes in model and + ensures their outputs are stored as part of the graph output. + + Args: + activation_only (bool, optional): whether to dump activation tensor only. Defaults to False. + weight_only (bool, optional): whether to dump weight_only. Defaults to False. 
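+ Note: this method itself takes no arguments; which tensors are exposed is
+ controlled by the instance attributes dump_op_types, black_nodes, white_nodes
+ and augment_nodes.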
+ """ + self.dequantized_output.clear() + onnx_version = version.Version(onnx.__version__) + if onnx_version < ONNX18_VERSION: + logger.warning("Static quantization for NLP model is supported at onnx 1.8.0 and newer.") + if self.already_quantized and any( + [i.dims in [1, 2] for i in self.model_wrapper.initializer() if i.name.endswith("_scale")] + ): + if self.opset_version < 13 and self.ort_version >= ORT112_VERSION: + logger.warning( + "Please use onnxruntime < 1.12.0 or upgrade model opset " + "version to 13 or higher to inspect per-channel quantized weight" + ) + + model = copy.deepcopy(self.model) + model_nodes_names = [node.name for node in model.graph.node] + + added_nodes = [] + added_outputs = [] + tensors_to_dump = set() + + for augment_node_type in self.augment_nodes: + if augment_node_type not in ["DequantizeLinear"]: # pragma: no cover + raise ValueError( + "Unexpected augment_node {} only DequantizeLinear is supported".format(augment_node_type) + ) + + if self.already_quantized: + # mapping between fp32 node and int8 node + new_white_nodes = [] + for white_node in self.white_nodes: + new_white_node = white_node + "_quant" + assert new_white_node in model_nodes_names, "no quantized {} in the graph".format(white_node) + new_white_nodes.append(new_white_node) + self.white_nodes = new_white_nodes + + node_outputs = [] + for node in model.graph.node: # pylint: disable=no-member + node_outputs.extend(node.output) + should_be_dump = ((node.op_type in self.dump_op_types) and (node.name not in self.black_nodes)) or ( + node.name in self.white_nodes + ) + if should_be_dump: + # add input tensors which should be dump + for input in node.input: + if len(input) != 0: # to prevent input is "" + initializer_tensor = self.model_wrapper.get_initializer(input) + if initializer_tensor is None: + tensors_to_dump.add(input) + # add output tensors which should be dump + tensors_to_dump.update([output for output in node.output if len(output) != 0]) + + model_inputs = [i.name for i in model.graph.input] + for tensor in tensors_to_dump: + if tensor not in node_outputs and tensor not in model_inputs: + continue + if self.augment_nodes: + for augment_node_type in self.augment_nodes: + if augment_node_type in ["DequantizeLinear"]: + # insert DequantizeLinear node as output + if tensor.endswith("_scale") or tensor.endswith("_zero_point"): # pragma: no cover + continue + + if not self.dynamically_quantized: + tensor = ( + tensor.replace("_QuantizeInput", "_quantized") + if tensor.endswith("_QuantizeInput") + else tensor + ) + else: + tensor = ( + tensor.replace("_output_quantized", "") + if tensor.endswith("_output_quantized") + else tensor + ) + + augment_node_name = tensor + "_new_" + augment_node_type + scale, zero_point = self.model_wrapper.get_scale_zero(tensor) + if scale: + # the tensor is in INT8 dtype + nodes, output = self._dequantize(tensor, scale, zero_point) + if output: + added_nodes.extend(nodes) + added_outputs.append( + onnx.helper.make_tensor_value_info( + output, onnx.TensorProto.FLOAT, () # pylint: disable=no-member + ) + ) # pylint: disable=no-member + else: + # the tensor is in FP32 dtype + if tensor not in [t.name for t in model.graph.output]: + added_tensor = onnx.helper.ValueInfoProto() + added_tensor.name = tensor + added_outputs.append(added_tensor) + else: + if tensor not in [t.name for t in model.graph.output]: + added_tensor = onnx.helper.ValueInfoProto() + added_tensor.name = tensor + added_outputs.append(added_tensor) + + if self.augment_nodes: + 
model.graph.node.extend(added_nodes) # pylint: disable=no-member + model.graph.output.extend(added_outputs) # pylint: disable=no-member + + self.augmented_model = model + if self.model_wrapper.is_large_model: # pragma: no cover + onnx.save_model( + model, + self.model_wrapper.model_path + "_augment.onnx", + save_as_external_data=True, + all_tensors_to_one_file=True, + convert_attribute=False, + ) + + def get_activation_tensors_calib_range(self, q_config=None): + """Get calib ranges of activation tensors. + + Args: + q_config (dict, optional): quantization config. Defaults to None. + + Returns: + dict: calib ranges + """ + # conduct inference session and get intermediate outputs + so = onnxruntime.SessionOptions() + so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL + if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): + so.register_custom_ops_library(onnxruntime_extensions.get_library_path()) + + execution_provider = ( + self.execution_provider + if self.execution_provider != "TensorrtExecutionProvider" + else "CUDAExecutionProvider" + ) + session = ( + onnxruntime.InferenceSession(self.augmented_model.SerializeToString(), so, providers=[execution_provider]) + if not self.model_wrapper.is_large_model + else onnxruntime.InferenceSession( + self.model_wrapper.model_path + "_augment.onnx", so, providers=[execution_provider] + ) + ) + + len_inputs = len(session.get_inputs()) + inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] + len_outputs = len(session.get_outputs()) + outputs_names = [session.get_outputs()[i].name for i in range(len_outputs)] + + node_output_names = [ + output.name if output.name not in self.dequantized_output else self.dequantized_output[output.name] + for output in session.get_outputs() + ] + augment_model_wrapper = ( + onnx_model.ONNXModel(self.augmented_model, load_external_data=False) + if not self.model_wrapper.is_large_model + else onnx_model.ONNXModel(self.model_wrapper.model_path + "_augment.onnx", load_external_data=False) + ) + input_name_to_nodes = augment_model_wrapper.input_name_to_nodes() + output_name_to_node = augment_model_wrapper.output_name_to_node() + name_to_node = {} + for data_name in node_output_names: + node = None + if data_name in output_name_to_node: + node = output_name_to_node[data_name] + elif data_name in input_name_to_nodes: + node = input_name_to_nodes[data_name][0] + assert node, "{} is neither an input nor an output of nodes in augmented model.".format(data_name) + name_to_node[data_name] = node.name + + activation_tensors_calib_range = {} + intermediate_tensor = {} + name_to_calibrator = {} + ort_inputs_for_next_split_model = [] + + def _collect_data(inputs): + for output_idx, output in enumerate(session.run(None, inputs)): + if q_config is not None and output.size != 0: + node_name = name_to_node[node_output_names[output_idx]] + if node_output_names[output_idx] not in name_to_calibrator: + calib_method = ( + q_config[node_name]["calibrate_method"] if q_config and node_name in q_config else "MinMax" + ) + assert calib_method in calibrator.CALIBRATOR, "Calibration method {} is not registered.".format( + calib_method + ) + _calibrator = calibrator.CALIBRATOR[calib_method]() + else: + _calibrator = name_to_calibrator[node_output_names[output_idx]] + + # currently, the calibration range for each iteration is collected if + # the calibration method is minmax, otherwise the tensor data is collected. 
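+ # (MinMax keeps only a running [min, max] per tensor, while Entropy and
+ # Percentile buffer the raw outputs in intermediate_tensor until every
+ # calibration batch has been seen.)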
+ # TODO: for entropy and percentile method, need to support range collection + # per iteration in the future. + if _calibrator.method_name == "MinMax": + _calibrator.collect(output) + activation_tensors_calib_range[node_output_names[output_idx]] = [list(_calibrator.calib_range)] + name_to_calibrator[node_output_names[output_idx]] = _calibrator + else: + intermediate_tensor.setdefault((node_output_names[output_idx], node_name), []).append(output) + elif q_config is None: + activation_tensors_calib_range.setdefault(node_output_names[output_idx], []).append(output) + + idx = 0 + while True: + inputs = self.dataloader.get_next() + if not inputs: + break + if self.iterations != []: + if idx > max(self.iterations): + break + if idx in self.iterations: + _collect_data(inputs) + else: + _collect_data(inputs) + idx += 1 + + # for entropy and percentile method, collect calibration range after all tensors are collected. + merged_dict = intermediate_tensor + for (output_name, node_name), datas in merged_dict.items(): + if any([data is None for data in datas]): + continue + if any([data.dtype in [bool] for data in datas]): # output type of some ops is bool, skip + continue + calib_method = q_config[node_name]["calibrate_method"] if q_config and node_name in q_config else 0 + _calibrator = calibrator.CALIBRATOR[calib_method]() + _calibrator.collect(datas) + activation_tensors_calib_range.setdefault(output_name, []).append(list(_calibrator.calib_range)) + _calibrator.clear() + del _calibrator + + return activation_tensors_calib_range + + def get_weight_tensors_calib_range(self): + """Get calib ranges of weight tensors. + + Returns: + dict: calib ranges + """ + model_nodes_names = [node.name for node in self.model.graph.node] + + # if augmented_model is not None, it means self.white_nodes is already updated in augment_graph func + # then skip update here + if self.already_quantized and self.augmented_model is None: + # mapping between fp32 node and int8 node + new_white_nodes = [] + for white_node in self.white_nodes: + new_white_node = white_node + "_quant" + assert new_white_node in model_nodes_names, "no quantized {} in the " "graph".format(white_node) + new_white_nodes.append(new_white_node) + self.white_nodes = new_white_nodes + + added_outputs = set() + initializer_tensors_to_dump = [] + initializers = [init.name for init in self.model.graph.initializer] + for node in self.model.graph.node: # pylint: disable=no-member + should_be_dump = ((node.op_type in self.dump_op_types) and (node.name not in self.black_nodes)) or ( + node.name in self.white_nodes + ) + if should_be_dump: + for input in node.input: + if ( + (self.already_quantized and input.replace("_dequantized", "_quantized") in initializers) + or (not self.already_quantized and input in initializers) + ) and len(input) != 0: + added_outputs.add(input) + + for tensor in added_outputs: + if tensor not in initializers: + continue + if self.augment_nodes: + for augment_node_type in self.augment_nodes: + if augment_node_type in ["DequantizeLinear"]: + if not (tensor.endswith("_scale") or tensor.endswith("_zero_point")): + initializer_tensors_to_dump.append(tensor) + else: + initializer_tensors_to_dump.append(tensor) + + weight_tensors_calib_range = {} + for initializer_tensor_name in initializer_tensors_to_dump: + initializer_tensor = self.model_wrapper.get_initializer(initializer_tensor_name) + + # double check initializer tensor is not None + if initializer_tensor is None: # pragma: no cover + continue + + initializer_tensor = 
onnx.numpy_helper.to_array( + initializer_tensor, + base_dir=( + os.path.dirname(self.model_wrapper.model_path) if self.model_wrapper.model_path is not None else "" + ), + ) + _calibrator = calibrator.CALIBRATOR["MinMax"]() # use minmax method to calibrate initializer tensors + if initializer_tensor.flatten().size > 0: + _calibrator.collect(initializer_tensor) + weight_tensors_calib_range[initializer_tensor_name] = [list(_calibrator.calib_range)] + _calibrator.clear() + del _calibrator + return weight_tensors_calib_range + + def get_intermediate_outputs(self, q_config=None, activation_only=False, weight_only=False): + """Gather intermediate model outputs after running inference.""" + output_dicts = {} + if not activation_only and not weight_only: + output_dicts = self.get_activation_tensors_calib_range(q_config) + output_dicts.update(self.get_weight_tensors_calib_range()) + elif weight_only: + output_dicts = self.get_weight_tensors_calib_range() + elif activation_only: + output_dicts = self.get_activation_tensors_calib_range(q_config) + + return list(output_dicts.keys()), output_dicts + + def _dequantize(self, tensor, scale_tensor, zo_tensor): + """Helper function to dequantize tensor.""" + int_tensor = self.model_wrapper.get_initializer(tensor) + if int_tensor: # weight tensor + return self._dequantize_weight(tensor, scale_tensor, zo_tensor) + else: + return self._dequantize_activation(tensor, scale_tensor, zo_tensor) + + def _dequantize_activation(self, activation_tensor_name, scale_tensor, zo_tensor): + """Helper function to dequantize activation.""" + added_nodes, added_output = self._add_dequantize_node(activation_tensor_name, scale_tensor, zo_tensor) + self.dequantized_output[added_output] = activation_tensor_name + return added_nodes, added_output + + def _dequantize_weight(self, weight_tensor_name, scale_tensor, zo_tensor): + """Helper function to dequantize weight.""" + weight_tensor = self.model_wrapper.get_initializer(weight_tensor_name) + if len(scale_tensor.dims) in [1, 2] and weight_tensor.dims[0] == max(scale_tensor.dims): + logger.debug("weight {} is quantized with per channel granularity.".format(weight_tensor_name)) + if self.opset_version < 13 and self.ort_version >= ORT112_VERSION: + logger.warning( + "Skip dequantizing weight {}, please use onnxruntime < 1.12.0 " + "or upgrade model opset version to 13 or higher".format(weight_tensor_name) + ) + return [], None + node = self.model_wrapper.input_name_to_nodes()[weight_tensor_name][0] + if "Conv" in node.op_type or ("Gemm" in node.op_type and quant_utils.is_B_transposed(node)): + added_nodes, added_output = self._add_dequantize_transpose_node( + weight_tensor_name, scale_tensor, zo_tensor, len(weight_tensor.dims) + ) + else: + added_nodes, added_output = self._add_dequantize_node( + weight_tensor_name, scale_tensor, zo_tensor, axis=1 if self.opset_version > 12 else None + ) + else: + added_nodes, added_output = self._add_dequantize_node(weight_tensor_name, scale_tensor, zo_tensor) + self.dequantized_output[added_output] = weight_tensor_name + return added_nodes, added_output + + def _add_dequantize_node(self, tensor_name, scale_tensor, zo_tensor, axis=None): + """Helper function to generate dequantize node.""" + dequantize_node = onnx.helper.make_node( + "DequantizeLinear", + [tensor_name, scale_tensor.name, zo_tensor.name], + [tensor_name + "_output"], + tensor_name + "_DequantizeLinear", + axis, + ) + return [dequantize_node], tensor_name + "_output" + + def _add_dequantize_transpose_node(self, tensor_name, 
scale_tensor, zo_tensor, dim): + """Insert Transpose-DequantizelLinear-Transpose pairs.""" + pre_transpose_node = onnx.helper.make_node( + "Transpose", + inputs=[tensor_name], + outputs=[tensor_name + "_transposed"], + perm=(1, 0, 2, 3) if dim == 4 else (1, 0), + name=tensor_name + "_pre_transpose", + ) + dequantize_node = onnx.helper.make_node( + "DequantizeLinear", + [tensor_name + "_transposed", scale_tensor.name, zo_tensor.name], + [tensor_name + "_DequantizeLinear"], + tensor_name + "_DequantizeLinear", + axis=1 if self.opset_version > 12 else None, + ) + post_transpose_node = onnx.helper.make_node( + "Transpose", + inputs=[tensor_name + "_DequantizeLinear"], + outputs=[tensor_name + "_output"], + perm=(1, 0, 2, 3) if dim == 4 else (1, 0), + name=tensor_name + "_post_transpose", + ) + added_nodes = [pre_transpose_node, dequantize_node, post_transpose_node] + return added_nodes, tensor_name + "_output" + + def _map_calibration(self, node_output_names, output_dicts): + """Map tensor names and min/max values.""" + merged_dict = {} + for name, minmaxs in output_dicts.items(): + for minmax in minmaxs: + if len(minmax) < 2: + continue + merged_dict.setdefault(name + "_Min", []).append(minmax[0]) + merged_dict.setdefault(name + "_Max", []).append(minmax[1]) + + # Characterizing distribution of a node's values across test data sets + clean_merged_dict = dict((i, merged_dict[i]) for i in merged_dict) + pairs = [ + tuple([float(min(clean_merged_dict[name + "_Min"])), float(max(clean_merged_dict[name + "_Max"]))]) + for name in node_output_names + ] + + final_dict = dict(zip(node_output_names, pairs)) + return final_dict + + def dump_minmax(self, q_config): + """Get calib ranges of tensors.""" + # pipeline of getting calib ranges of tensors during calibration: + # 1. augment_graph(): insert activation tensors to model output + # 2. get_intermediate_outputs(): + # 2.1 get_activation_tensors_calib_range(): get calib ranges of activation tensors using the augment graph + # 2.2 get_weight_tensors_calib_range(): get calib ranges of weight tensors + self.augment_graph() + node_output_names, output_dicts = self.get_intermediate_outputs(q_config) + return self._map_calibration(node_output_names, output_dicts) + + def dump_calibration(self, q_config, min_max=None): + """Gather calibration params for quantization. + + Args: + q_config (dict): op-wise quantization config + min_max (dict, optional): min/max values of tensors + """ + return ( + self.calculate_quantization_params(q_config, self.dump_minmax(q_config)) + if min_max is None + else self.calculate_quantization_params(q_config, min_max) + ) + + def calculate_quantization_params(self, q_config, quantization_thresholds): + """Given quantization thresholds, calculate the quantization params. 
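+
+ The result maps each tensor name to a [zero_point, scale] pair derived from
+ its calibration range and the activation type/symmetry configured for the
+ surrounding nodes.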
+ + Args: + q_config (dict): op-wise quantization config + quantization_thresholds (dict): Dictionary specifying the min and max values + or outputs of conv and matmul nodes, should be + specified in the following format: + {"param_name": [min, max]} + """ + if quantization_thresholds is None: + raise ValueError( + "quantization thresholds is required to calculate quantization \ + params (zero point and scale)" + ) + + quantization_params = {} + model = self.model + + input_name_to_nodes = self.model_wrapper.input_name_to_nodes() + output_name_to_node = self.model_wrapper.output_name_to_node() + + for tensor_name in quantization_thresholds.keys(): + child = None + if tensor_name in input_name_to_nodes: + children = input_name_to_nodes[tensor_name] + if len(children) == 1: + child = children[0] + parent = None + sym = False + qType = 2 # uint8 + + # input and output tensor follow activation_type and activation_sym + if tensor_name in input_name_to_nodes and any( + [i.name in q_config for i in input_name_to_nodes[tensor_name]] + ): + for child in input_name_to_nodes[tensor_name]: + if child.name in q_config and q_config[child.name] not in ["fp32", "fp16", "bf16"]: + sym = q_config[child.name]["activation_sym"] + qType = q_config[child.name]["activation_type"] + break + elif ( + tensor_name in output_name_to_node + and output_name_to_node[tensor_name].name in q_config + and q_config[output_name_to_node[tensor_name].name] not in ["fp32", "fp16", "bf16"] + ): + sym = q_config[output_name_to_node[tensor_name].name]["activation_sym"] + qType = q_config[output_name_to_node[tensor_name].name]["activation_type"] + if self.execution_provider in ["TensorrtExecutionProvider"]: + # TensorrtExecutionProvider only support int8 + qType = 3 + node_thresholds = quantization_thresholds[tensor_name] + node_params = self.calculate_scale_zeropoint( + parent, + child, + node_thresholds[0], + node_thresholds[1], + sym, + qType, + ) + quantization_params[tensor_name] = node_params + + return quantization_params + + def calculate_scale_zeropoint(self, last_node, next_node, rmin, rmax, sym, qType): + """Given the source and destination node of tensor, return calculated zero point and scales.""" + zp_and_scale = [] + # adjust rmin and rmax such that 0 is included in the range. This is required + # to make sure zero can be uniquely represented. 
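+ # e.g. a tensor observed in [0.2, 3.1] is widened to [0.0, 3.1] so that a real
+ # value of 0 maps exactly onto one representable quantized integer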
+ rmin = min(rmin, 0) + rmax = max(rmax, 0) + if next_node: + if next_node.op_type == "Relu": + if rmin < 0: + rmin = 0 + elif next_node.op_type == "Clip" and len(next_node.input) == 3: + if self.model_wrapper.get_initializer(next_node.input[1]) is not None: + clip_min = onnx.numpy_helper.to_array(self.model_wrapper.get_initializer(next_node.input[1])) + if rmin < clip_min: + rmin = clip_min.tolist() if not isinstance(clip_min.tolist(), list) else clip_min.tolist()[0] + if self.model_wrapper.get_initializer(next_node.input[2]) is not None: + clip_max = onnx.numpy_helper.to_array(self.model_wrapper.get_initializer(next_node.input[2])) + if rmax > clip_max: + rmax = clip_max.tolist() if not isinstance(clip_max.tolist(), list) else clip_max.tolist()[0] + + if last_node: + if last_node.op_type in ["Conv", "FusedConv"]: + attrs = [attr for attr in last_node.attribute] + attrs_names = [attr.name for attr in last_node.attribute] + if "activation" in attrs_names: + if attrs[attrs_names.index("activation")].s == b"Relu": + rmin = max(rmin, 0) + if attrs[attrs_names.index("activation")].s == b"Clip": + assert ( + "activation_params" in attrs_names + ), "the model contains no params for clip node {}".format(last_node) + clip_params = attrs[attrs_names.index("activation_params")].floats + rmin = min(rmin, clip_params[0], clip_params[1]) + rmax = max(rmax, clip_params[0], clip_params[1]) + + scale, zp = quant_utils.calculate_scale_zp(rmin, rmax, qType, sym, self.reduce_range) + zp_and_scale.append(zp) + zp_and_scale.append(scale) + + return zp_and_scale diff --git a/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py b/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py new file mode 100644 index 000000000..abef2d323 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py @@ -0,0 +1,401 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft, Intel Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- +"""Calibrator for onnx models.""" + +import copy + +import numpy as np +from scipy import stats + +CALIBRATOR = {} + + +def calib_registry(calib_method): + """The class decorator used to register all Calibrator subclasses.""" + + def decorator_calib(cls): + assert cls.__name__.endswith( + "Calibrator" + ), "The name of subclass of Calibrator should end with 'Calibrator' substring." 
+ if cls.__name__[: -len("Calibrator")] in CALIBRATOR: # pragma: no cover + raise ValueError("Cannot have two operators with the same name.") + CALIBRATOR[calib_method] = cls + return cls + + return decorator_calib + + +class CalibratorBase: + """Base calibrator class.""" + + def __init__(self): + """Initialize base calibrator class.""" + self._calib_min = None + self._calib_max = None + + def collect(self, datas): + """Collect calibration range.""" + self.collect_calib_data(datas) + + def clear(self): + """Clear calibration range.""" + self._calib_min = None + self._calib_max = None + + def collect_calib_data(self, datas): + """Collect calibration range value.""" + raise NotImplementedError + + @property + def calib_range(self): + """Get calibration range value.""" + return self._calib_min, self._calib_max + + +@calib_registry(calib_method="MinMax") +class MinMaxCalibrator(CalibratorBase): + """MinMax calibrator class.""" + + def __init__(self): + """Initialize minmax calibrator class.""" + super(MinMaxCalibrator, self).__init__() + + def collect_calib_data(self, datas): + """Collect calibration range.""" + if isinstance(datas, list) and len(set([data.shape for data in datas])) != 1: + for data in datas: + if data.size == 0: # pragma: no cover + continue + self._collect_value(data) + else: + datas = np.asarray(datas) + datas = datas.flatten() + assert datas.size > 0, "collected intermediate data size" "should not be 0, please check augmented_model" + self._collect_value(datas) + + def _collect_value(self, data): + """Collect min/max value.""" + data = np.asarray(data) + + local_min = np.min(data[np.isinf(data) == False]) # noqa: E712 + local_max = np.max(data[np.isinf(data) == False]) # noqa: E712 + if self._calib_min is None and self._calib_max is None: + self._calib_min = local_min + self._calib_max = local_max + else: + self._calib_min = np.minimum(self._calib_min, local_min) + self._calib_max = np.maximum(self._calib_max, local_max) + + @property + def method_name(self): + """Get calibration method name.""" + return "MinMax" + + +@calib_registry(calib_method="Percentile") +class PercentileCalibrator(CalibratorBase): + """Percentile calibrator class. + + Args: + num_bins (int, optional): number of bins to create a new histogram + for collecting tensor values. Defaults to 2048. + percentile (float, optional): A float number between [0, 100]. Defaults to 99.999. + """ + + def __init__(self, num_bins=2048, percentile=99.999): + """Initialize percentile calibrator class.""" + super(PercentileCalibrator, self).__init__() + self.collector = None + self.num_bins = num_bins + self.percentile = percentile + + def collect_calib_data(self, datas): + """Collect calibration range.""" + if not self.collector: + self.collector = HistogramCollector(self.num_bins) + self.collector.collect_data(datas) + self.compute_percentile_range(self.percentile) + + def compute_percentile_range(self, percentile): + """Compute percentile range.""" + if percentile < 0 or percentile > 100: + raise ValueError("Invalid percentile. 
Must be in range 0 <= percentile <= 100.") + + calib_hist, calib_bin_edges, min_range, max_range, th = self.collector.histogram + total = calib_hist.sum() + cdf = np.cumsum(calib_hist / total) + percent_to_cut_one_side = (100.0 - percentile) / 200.0 + max_idx = np.searchsorted(cdf, 1.0 - percent_to_cut_one_side) + min_idx = np.searchsorted(cdf, percent_to_cut_one_side) + self._calib_min = calib_bin_edges[min_idx].astype("float32") + self._calib_max = calib_bin_edges[max_idx].astype("float32") + if self._calib_min < min_range: + self._calib_min = min_range + if self._calib_max > max_range: + self._calib_max = max_range + + def clear(self): + """Clear calibration range.""" + self._calib_min = None + self._calib_max = None + self.collector = None + + @property + def method_name(self): + """Get calibration method name.""" + return "Percentile" + + +@calib_registry(calib_method="Entropy") +class EntropyCalibrator(CalibratorBase): + """Entropy calibrator class. + + Args: + num_bins (int, optional):number of bins to create a new histogram + for collecting tensor values. Defaults to 128. + num_quantized_bins (int, optional): number of quantized bins. Defaults to 128. + """ + + def __init__(self, num_bins=128, num_quantized_bins=128): + """Initialize entropy calibrator class.""" + super(EntropyCalibrator, self).__init__() + self.collector = None + self.num_bins = num_bins + self.num_quantized_bins = num_quantized_bins + + def collect_calib_data(self, datas): + """Collect calibration range.""" + if not self.collector: + self.collector = HistogramCollector(self.num_bins) + self.collector.collect_data(datas) + self.compute_kl_range() + + def compute_kl_range(self): + """Compute entropy range.""" + histogram = self.collector.histogram + self._calib_min, self._calib_max = self.get_kl_threshold(histogram, self.num_quantized_bins) + + def get_kl_threshold(self, histogram, num_quantized_bins): + """Compute entropy threshold. + + Ref: + https://github.com//apache/incubator-mxnet/blob/master/python/mxnet/contrib/quantization.py + https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/calibrate.py + + Args: + histogram (tuple): hist, hist_edges, min, max and threshold + num_quantized_bins (int): number of quantized bins. 
+ + Returns: + float: optimal threshold + """ + hist = histogram[0] + hist_edges = histogram[1] + num_bins = hist.size + zero_bin_index = num_bins // 2 + num_half_quantized_bin = num_quantized_bins // 2 + + kl_divergence = np.zeros(zero_bin_index - num_half_quantized_bin + 1) + thresholds = [(0, 0) for i in range(kl_divergence.size)] + + for i in range(num_half_quantized_bin, zero_bin_index + 1, 1): + start_index = zero_bin_index - i + end_index = zero_bin_index + i + 1 if (zero_bin_index + i + 1) <= num_bins else num_bins + + thresholds[i - num_half_quantized_bin] = ( + float(hist_edges[start_index]), + float(hist_edges[end_index]), + ) + + sliced_distribution = copy.deepcopy(hist[start_index:end_index]) + + # reference distribution p + p = sliced_distribution.copy() # a copy of np array + left_outliers_count = sum(hist[:start_index]) + right_outliers_count = sum(hist[end_index:]) + p[0] += left_outliers_count + p[-1] += right_outliers_count + + # nonzeros[i] incidates whether p[i] is non-zero + nonzeros = (p != 0).astype(np.int64) + + # quantize p.size bins into quantized bins (default 128 bins) + quantized_bins = np.zeros(num_quantized_bins, dtype=np.int64) + num_merged_bins = sliced_distribution.size // num_quantized_bins + + # merge bins into quantized bins + for index in range(num_quantized_bins): + start = index * num_merged_bins + end = start + num_merged_bins + quantized_bins[index] = sum(sliced_distribution[start:end]) + quantized_bins[-1] += sum(sliced_distribution[num_quantized_bins * num_merged_bins :]) + + # in order to compare p and q, we need to make length of q equals to length of p + # expand quantized bins into p.size bins + q = np.zeros(p.size, dtype=np.int64) + for index in range(num_quantized_bins): + start = index * num_merged_bins + end = start + num_merged_bins + + norm = sum(nonzeros[start:end]) + if norm != 0: + q[start:end] = float(quantized_bins[index]) / float(norm) + + p = smooth_distribution(p) + q = smooth_distribution(q) + + if isinstance(q, np.ndarray): + kl_divergence[i - num_half_quantized_bin] = stats.entropy(p, q) + else: + kl_divergence[i - num_half_quantized_bin] = float("inf") + + min_kl_divergence_idx = np.argmin(kl_divergence) + optimal_threshold = thresholds[min_kl_divergence_idx] + min_value = histogram[2] + max_value = histogram[3] + if optimal_threshold[0] < min_value: + optimal_threshold = (min_value, optimal_threshold[1]) + if optimal_threshold[1] > max_value: + optimal_threshold = (optimal_threshold[0], max_value) + return optimal_threshold[0], optimal_threshold[1] + + def clear(self): + """Clear calibration range.""" + self._calib_min = None + self._calib_max = None + self.collector = None + + @property + def method_name(self): + """Get calibration method name.""" + return "Entropy" + + +class HistogramCollector: + """Histogram collctor class.""" + + def __init__(self, num_bins=2048): + """Initialize histogram collctor.""" + self._num_bins = num_bins + self._histogram = None + + def collect_data(self, datas): + """Collect histogram data.""" + if isinstance(datas, list) and len(set([data.shape for data in datas])) != 1: + for data in datas: + if data.size == 0: # pragma: no cover + continue + self._collect_value(data) + else: + datas = np.asarray(datas) + datas = datas.flatten() + assert datas.size > 0, "collected intermediate data size" "should not be 0, please check augmented_model" + self._collect_value(datas) + + def _collect_value(self, data): + """Collect value.""" + data = np.asarray(data) + min_range = np.min(data) + max_range = 
np.max(data) + + th = max(abs(min_range), abs(max_range)) + if self._histogram is None: + hist, hist_edges = np.histogram(data, self._num_bins, range=(-th, th)) + self._histogram = (hist, hist_edges, min_range, max_range, th) + else: + self._histogram = self.combine_histogram(self._histogram, data, min_range, max_range, th) + + def combine_histogram(self, old_hist, data_arr, new_min, new_max, new_th): + """Combine histogram.""" + (old_hist, old_hist_edges, old_min, old_max, old_th) = old_hist + + if new_th <= old_th: + hist, _ = np.histogram(data_arr, bins=len(old_hist), range=(-old_th, old_th)) + return ( + old_hist + hist, + old_hist_edges, + min(old_min, new_min), + max(old_max, new_max), + old_th, + ) + else: + # Need to generate new histogram with new_th + if old_th == 0: + hist, hist_edges = np.histogram(data_arr, len(old_hist), range=(-new_th, new_th)) + hist += old_hist + else: + old_num_bins = len(old_hist) + old_step = 2 * old_th / old_num_bins + half_increased_bins = int((new_th - old_th) // old_step + 1) + new_num_bins = half_increased_bins * 2 + old_num_bins + new_th = half_increased_bins * old_step + old_th + hist, hist_edges = np.histogram(data_arr, bins=new_num_bins, range=(-new_th, new_th)) + hist[half_increased_bins : new_num_bins - half_increased_bins] += old_hist + return ( + hist, + hist_edges, + min(old_min, new_min), + max(old_max, new_max), + new_th, + ) + + @property + def histogram(self): + """Get histogram.""" + return self._histogram + + +def smooth_distribution(p, eps=0.0001): + """Smooth distribution. + + Given a discrete distribution (may have not been normalized to 1), + smooth it by replacing zeros with eps multiplied by a scaling factor + and taking the corresponding amount off the non-zero values. + Ref: + http://hanj.cs.illinois.edu/cs412/bk3/KL-divergence.pdf + https://github.com//apache/incubator-mxnet/blob/master/python/mxnet/contrib/quantization.py + https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/calibrate.py + + Args: + p (array): distribution array + eps (float, optional): a small probability. Defaults to 0.0001. + + Returns: + array: smoothed distribution + """ + is_zeros = (p == 0).astype(np.float32) + is_nonzeros = (p != 0).astype(np.float32) + n_zeros = is_zeros.sum() + n_nonzeros = p.size - n_zeros + + if not n_nonzeros: + return -1 + eps1 = eps * float(n_zeros) / float(n_nonzeros) + assert eps1 < 1.0, "n_zeros=%d, n_nonzeros=%d, eps1=%f" % ( + n_zeros, + n_nonzeros, + eps1, + ) + + hist = p.astype(np.float32) + hist += eps * is_zeros + (-eps1) * is_nonzeros + assert (hist <= 0).sum() == 0 + + return hist diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/__init__.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/__init__.py new file mode 100644 index 000000000..454c3ea69 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Operators for onnx model.""" + +import glob +from os import path + +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + +modules = glob.glob(path.join(path.dirname(__file__), "*.py")) + +for f in modules: + if path.isfile(f) and not f.startswith("__") and not f.endswith("__init__.py"): + __import__(path.basename(f)[:-3], globals(), locals(), level=1) + +OPERATORS = base_op.OPERATORS diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/activation.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/activation.py new file mode 100644 index 000000000..c06d92dac --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/activation.py @@ -0,0 +1,112 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Activation operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="LeakyRelu, Sigmoid", mode=[constants.STATIC_QUANT]) +class ActivationOperator(base_op.Operator): + """Activation operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(ActivationOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + data_found, _, _, _, _ = self.quantizer._get_quantization_params(node.output[0]) + if not data_found: + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + super().quantize() + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + + children = self.quantizer.model.get_children(node) + if len(children) == 0 or not node.name.endswith("_quant"): + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parent = self.quantizer.model.get_parents(node)[0] + child = self.quantizer.model.get_children(node)[0] + + inputs = [] + inputs.extend(parent.input) + inputs.extend(child.input[1:]) + + qlinear_activation_output = child.output[0] + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + kwargs["domain"] = quant_utils.ms_domain + + qlinear_activation_node = onnx.helper.make_node( + "QLinear" + node.op_type, inputs, [qlinear_activation_output], node.name, **kwargs + ) + + self.quantizer.new_nodes.append(qlinear_activation_node) + self.quantizer.remove_nodes.extend([parent, child, node]) + + +@base_op.op_registry(op_types="Relu, Clip", mode=[constants.STATIC_QUANT]) +class RemovableActivationOperator(base_op.Operator): + """Removable activation operator.""" + + def __init__(self, 
onnx_quantizer, onnx_node): + """Initialization.""" + super(RemovableActivationOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + if node.input[0] not in self.quantizer.quantized_value_map: + return False + return True + + def quantize(self): + """Do quantization.""" + node = self.node + if node.output[0] in [i.name for i in self.quantizer.model.model.graph.output]: + self.quantizer.dequantize_tensor(node, node.input[0]) + else: + self.quantizer.model.replace_input_of_all_nodes(node.output[0], node.input[0]) + self.quantizer.remove_nodes.append(node) + + +@base_op.op_registry( + op_types="Softmax, BiasGelu, Elu, Exp, FastGelu, Gelu, Softplus, Tanh", mode=[constants.STATIC_QUANT] +) +class Float16ActivationOperator(base_op.Operator): + """Float16 Activation operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(Float16ActivationOperator, self).__init__(onnx_quantizer, onnx_node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/argmax.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/argmax.py new file mode 100644 index 000000000..594e24c05 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/argmax.py @@ -0,0 +1,40 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ArgMax operator.""" + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="ArgMax", mode=[constants.STATIC_QUANT]) +class ArgMaxOperator(base_op.Operator): + """ArgMax operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(ArgMaxOperator, self).__init__(onnx_quantizer, onnx_node) + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + return True + + def convert(self): + """Convert to quantized format.""" + node = self.node + origin_name = node.input[0].split("_argmax_node")[0] + + if origin_name in self.quantizer.quantized_value_map: + node.name = node.name + "_quant" diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/attention.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/attention.py new file mode 100644 index 000000000..46f102352 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/attention.py @@ -0,0 +1,71 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Attention operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="Attention", mode=[constants.DYNAMIC_QUANT, constants.STATIC_QUANT]) +class AttentionOperator(base_op.Operator): + """Attention operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(AttentionOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, [0, 1]) + node.name = node.name + "_quant" + + def convert(self): + """Convert QDQ mode to QOperator format.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + quantized_name = [] + scale = [] + zp = [] + for parent in parents[:2]: + if parent.op_type == "DynamicQuantizeLinear": + quantized_name.append(parent.output[0]) + scale.append(parent.output[1]) + zp.append(parent.output[2]) + elif parent.op_type == "DequantizeLinear": + quantized_name.append(parent.input[0]) + scale.append(parent.input[1]) + zp.append(parent.input[2]) + self.quantizer.remove_nodes.append(parent) + + inputs = [] + inputs.extend(quantized_name) + inputs.append(node.input[2]) + inputs.extend(scale) + inputs.append(node.input[3] if len(node.input) > 3 else "") + inputs.extend(zp) + if len(node.input) > 4: + inputs.append(node.input[4]) + + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + kwargs["domain"] = quant_utils.ms_domain + qattention_node = onnx.helper.make_node("QAttention", inputs, node.output, node.name, **kwargs) + self.quantizer.new_nodes.append(qattention_node) + + self.quantizer.remove_nodes.append(node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py new file mode 100644 index 000000000..a5d3bc62d --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py @@ -0,0 +1,92 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
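For reference, the QAttention inputs assembled by AttentionOperator.convert() above end up packed in this order (derived from the loops shown; trailing optional slots may be empty strings):

    input_q, weight_q, bias,
    input_scale, weight_scale,
    mask_index (optional),
    input_zero_point, weight_zero_point,
    past (optional)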
+"""Base Operator.""" + +from onnx_neural_compressor import constants, quantization + +OPERATORS = { + "dynamic_quant": {}, + "static_quant": {}, +} + + +def op_registry(op_types, mode): + """The class decorator used to register all Operator subclasses.""" + + def decorator_op(cls): + assert cls.__name__.endswith( + "Operator" + ), "The name of subclass of Operator should end with 'Operator' substring." + for item in mode: + if cls.__name__[: -len("Operator")] in OPERATORS[item]: # pragma: no cover + raise ValueError("Cannot have two operators with the same name for {} mode.".format(item)) + break + for single_op_type in [op_type.strip() for op_type in op_types.split(",")]: + for item in mode: + OPERATORS[item][single_op_type] = cls + return cls + + return decorator_op + + +class Operator(object): + """Base Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + self.quantizer = onnx_quantizer + self.node = onnx_node + node_name = self.node.name.split("_quant")[0] + if node_name in self.quantizer.config: + self.dtype = self.quantizer.config[node_name] + self.disable_qdq_for_node_output = ( + True if onnx_node.op_type in onnx_quantizer.optypes_to_exclude_output_quant else False + ) + self.per_channel = False + self.calibrate_method = 0 # minmax + self.weight_sym = True + self.weight_dtype = None + self.activation_dtype = None + self.activation_sym = False + if node_name in self.quantizer.config: + if self.quantizer.config[node_name] not in self.quantizer.fallback_list: + self.per_channel = self.quantizer.config[node_name]["per_channel"] + self.calibrate_method = self.quantizer.config[node_name]["calibrate_method"] + self.weight_sym = self.quantizer.config[node_name]["weight_sym"] + self.weight_dtype = self.quantizer.config[node_name]["weight_type"] + self.activation_dtype = self.quantizer.config[node_name]["activation_type"] + self.activation_sym = self.quantizer.config[node_name]["activation_sym"] + + def quantize_check(self): + """Check if quantizaion can be done.""" + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node) + if not self.disable_qdq_for_node_output or self.quantizer.mode != constants.DYNAMIC_QUANT: + self.quantizer.quantize_outputs(node) + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + + if not node.name.endswith("_quant"): + return False + return True + + def convert(self): + """Convert to QOperator format.""" + return diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/binary_op.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/binary_op.py new file mode 100644 index 000000000..4aa1637b7 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/binary_op.py @@ -0,0 +1,150 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Binary operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="Add, Mul", mode=[constants.STATIC_QUANT]) +class BinaryOperator(base_op.Operator): + """Binary operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(BinaryOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + data_found, _, _, _, _ = self.quantizer._get_quantization_params(node.output[0]) + if not data_found: + return False + if self.quantizer.execution_provider == "TensorrtExecutionProvider": + return True + if not all([self.quantizer.is_valid_quantize_weight(i) for i in node.input]): + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, initializer_use_weight_qType=False) + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(node) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + children = self.quantizer.model.get_children(node) + if len(children) == 0 or not node.name.endswith("_quant"): + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + child = self.quantizer.model.get_children(node)[0] + + qlinear_binary_math_output = child.output[0] + + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + kwargs["domain"] = quant_utils.ms_domain + + qlinear_binary_math_inputs = [] + for parent in parents: + qlinear_binary_math_inputs.extend(parent.input) + qlinear_binary_math_inputs.extend(child.input[1:]) + + qlinear_binary_math_node = onnx.helper.make_node( + "QLinear" + node.op_type, qlinear_binary_math_inputs, [qlinear_binary_math_output], node.name, **kwargs + ) + + self.quantizer.new_nodes += [qlinear_binary_math_node] + self.quantizer.remove_nodes.extend(parents) + self.quantizer.remove_nodes.append(child) + self.quantizer.remove_nodes.append(node) + + +@base_op.op_registry(op_types="Mod", mode=[constants.STATIC_QUANT]) +class BinaryDirect8BitOperator(base_op.Operator): + """Binary operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(BinaryDirect8BitOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + data_found, _, _, _, _ = self.quantizer._get_quantization_params(node.output[0]) + if not data_found: + return False + if not all([self.quantizer.is_valid_quantize_weight(i) for i in node.input]): + return False + + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, initializer_use_weight_qType=False) + if not self.disable_qdq_for_node_output or self.quantizer.mode != "qdq": + self.quantizer.quantize_outputs(node) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + + children = self.quantizer.model.get_children(node) + if len(children) == 0 or not node.name.endswith("_quant"): + return False + return True + + def convert(self): + """Convert to QOperator 
format.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if any([i.op_type == "DequantizeLinear" for i in parents]) and any( + [i.op_type == "QuantizeLinear" for i in children] + ): + for idx, parent in enumerate(parents): + if parent.op_type == "DequantizeLinear": + self.node.input[idx] = parent.input[0] + self.quantizer.remove_nodes.append(parent) + for child in children: + if child.op_type == "QuantizeLinear": + self.quantizer.remove_nodes.append(child) + self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") + node.output[0] = node.output[0] + "_quantized" + + +@base_op.op_registry( + op_types="Sum, Sub, Div, Pow, Equal, Greater, GreaterOrEqual, Less, LessOrEqual", mode=[constants.STATIC_QUANT] +) +class Float16BinaryOperator(base_op.Operator): + """Float16 Binary operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(Float16BinaryOperator, self).__init__(onnx_quantizer, onnx_node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/concat.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/concat.py new file mode 100644 index 000000000..9e0f0ff6b --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/concat.py @@ -0,0 +1,125 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Concat Operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="Concat", mode=[constants.STATIC_QUANT]) +class ConcatOperator(base_op.Operator): + """Concat Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(ConcatOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + if len(node.input) == 1: # pragma: no cover + return False + inits = [i.name for i in self.quantizer.model.initializer()] + if all([inp not in self.quantizer.quantized_value_map and inp not in inits for inp in node.input]) or not all( + [inp in self.quantizer.quantized_value_map or inp in inits for inp in node.input] + ): + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + inits = [i.name for i in self.quantizer.model.initializer()] + for idx, inp in enumerate(node.input): + initializer_use_weight_qType = inp not in inits + self.quantizer.quantize_inputs(node, [idx], initializer_use_weight_qType) + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(node) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if len(children) == 0 or len(parents) == 0 or not node.name.endswith("_quant"): + return False + + # check input type + if all([i.op_type == "DequantizeLinear" for i in parents]) and any( + [i.op_type == "QuantizeLinear" for i in children] + ): + input_zp, input_scale, output_zp = [], [], [] + input_zp = [parent.input[2] for parent in parents] + input_scale = [parent.input[1] for parent in parents] + output_zp = [child.input[2] for child in children if child.op_type == "QuantizeLinear"] + + if ( + any([self.quantizer.model.get_initializer(zp) is None for zp in input_zp]) + or any([self.quantizer.model.get_initializer(zp) is None for zp in output_zp]) + or any([self.quantizer.model.get_initializer(scale) is None for scale in input_scale]) + ): # pragma: no cover + return False + + # check input scale is float type + if any( + [self.quantizer.model.get_initializer(scale).data_type != 1 for scale in input_scale] + ): # pragma: no cover + return False + # check input zp type is the same with output zp type + if any( + [ + self.quantizer.model.get_initializer(in_zp).data_type + not in [self.quantizer.model.get_initializer(out_zp).data_type for out_zp in output_zp] + for in_zp in input_zp + ] + ): + return False + + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + + if all([i.op_type == "DequantizeLinear" for i in parents]) and any( + [i.op_type == "QuantizeLinear" for i in children] + ): + inputs = [] + + inputs.extend([i for i in children if i.op_type == "QuantizeLinear"][0].input[1:]) + for parent in parents: + inputs.extend(parent.input) + self.quantizer.remove_nodes.append(parent) + for child in children: + if child.op_type == "QuantizeLinear": + self.quantizer.remove_nodes.append(child) + self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") 
+ + kwargs = {} + for attribute in node.attribute: + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + kwargs["domain"] = quant_utils.ms_domain + qlconcat_node = onnx.helper.make_node( + "QLinearConcat", inputs, [node.output[0] + "_quantized"], node.name, **kwargs + ) + + self.quantizer.new_nodes += [qlconcat_node] + self.quantizer.remove_nodes.append(node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/conv.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/conv.py new file mode 100644 index 000000000..ede7e1bfa --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/conv.py @@ -0,0 +1,201 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Conv Operator.""" + + +import onnx +from onnx import onnx_pb as onnx_proto + +from onnx_neural_compressor import constants +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="Conv, FusedConv", mode=[constants.DYNAMIC_QUANT]) +class ConvOperator(base_op.Operator): + """Conv Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(ConvOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize(self): + """Do quantizaion.""" + node = self.node + if node.op_type == "FusedConv": + kwargs = {} + for attribute in node.attribute: + if attribute.name == "activation" and attribute.s in [b"Relu", b"Clip"]: + continue + if attribute.name == "activation_params": + continue + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + conv = onnx.helper.make_node("Conv", node.input, node.output, node.name, **kwargs) + node.CopyFrom(conv) + + self.quantizer.quantize_inputs(node, [0]) + + if self.per_channel: + self.quantizer.quantize_weights_per_channel(node, [1], self.weight_dtype, self.weight_sym, 0) + else: + self.quantizer.quantize_inputs(node, [1]) + + if len(node.input) == 3: + self.quantizer.quantize_bias_tensor(node) + + node.name = node.name + "_quant" + + def convert(self): + """Convert to QOperator format.""" + node = self.node + inputs = [] + parents = self.quantizer.model.get_parents(node) + if parents[0].op_type == "QuantizeLinear": + inputs.append(parents[0].output[0]) + inputs.append(parents[1].input[0]) + inputs.append(parents[0].input[2]) + inputs.append(parents[1].input[2]) + scale_0 = parents[0].input[1] + else: + inputs.append(parents[0].output[0]) + inputs.append(parents[1].input[0]) + inputs.append(parents[0].output[2]) + inputs.append(parents[1].input[2]) + scale_0 = parents[0].output[1] + scale_1 = parents[1].input[1] + # quantize bias if exist + quantized_bias_name = "" + bias_present = False + if len(node.input) == 3: + quantized_bias_name = node.input[2] + "_quantized" + bias_present = True + + conv_integer_output = node.output[0] + "_output_quantized" + + kwargs = {} + for attribute in node.attribute: + if attribute.name == 
"activation" and attribute.s in [b"Relu", b"Clip"]: # pragma: no cover + continue + if attribute.name == "activation_params": # pragma: no cover + continue + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + conv_integer_node = onnx.helper.make_node("ConvInteger", inputs, [conv_integer_output], node.name, **kwargs) + self.quantizer.new_nodes.append(conv_integer_node) + + # Add bias add nodes + if bias_present: + conv_integer_output = self.quantizer.get_bias_add_nodes( + node, parents[1].input[0], conv_integer_output, quantized_bias_name + ) + + # Add cast operation to cast convInteger output to float. + cast_op_output = conv_integer_output + "_cast_output" + cast_node = onnx.helper.make_node( + "Cast", + [conv_integer_output], + [cast_op_output], + conv_integer_output + "_cast", + to=onnx_proto.TensorProto.FLOAT, + ) + self.quantizer.new_nodes.append(cast_node) + + # Add mul operation to multiply scales of two inputs. + scales_mul_op = node.name + "_scales_mul" + + scales_mul_node = quant_utils.find_by_name(scales_mul_op, self.quantizer.new_nodes) + if scales_mul_node is None: + scales_mul_node = onnx.helper.make_node("Mul", [scale_0, scale_1], [scales_mul_op + ":0"], scales_mul_op) + self.quantizer.new_nodes.append(scales_mul_node) + + scales_mul_op_output = scales_mul_node.output[0] + + # Add mul operation to multiply mul_scales_op result with output of ConvInteger + # and make the output of this node the same as output of original conv node. + output_scale_mul_op = node.name + "_output_scale_mul" + self.quantizer.new_nodes.append( + onnx.helper.make_node("Mul", [cast_op_output, scales_mul_op_output], [node.output[0]], output_scale_mul_op) + ) + self.quantizer.remove_nodes.extend(parents[1:]) + self.quantizer.remove_nodes.append(node) + + +@base_op.op_registry(op_types="Conv, FusedConv", mode=[constants.STATIC_QUANT]) +class StaticConvOperator(ConvOperator): + """Conv Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(ConvOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize(self): + """Do quantizaion.""" + node = self.node + if node.op_type == "FusedConv": + kwargs = {} + for attribute in node.attribute: + if attribute.name == "activation" and attribute.s in [b"Relu", b"Clip"]: + continue + if attribute.name == "activation_params": + continue + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + conv = onnx.helper.make_node("Conv", node.input, node.output, node.name, **kwargs) + node.CopyFrom(conv) + + self.quantizer.quantize_inputs(node, [0]) + + if self.per_channel: + self.quantizer.quantize_weights_per_channel(node, [1], self.weight_dtype, self.weight_sym, 0) + else: + self.quantizer.quantize_inputs(node, [1]) + + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(node) + + if len(node.input) == 3: + self.quantizer.quantize_bias_tensor(node) + + node.name = node.name + "_quant" + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + if len(self.quantizer.model.get_children(node)) == 0 or not node.name.endswith("_quant"): # pragma: no cover + return + parents = self.quantizer.model.get_parents(node) + child = self.quantizer.model.get_children(node)[0] + qlinear_conv_inputs = [] + for parent in parents[0:2]: + qlinear_conv_inputs.extend(parent.input) + qlinear_conv_inputs.extend(child.input[1:]) + if len(parents) == 3: + qlinear_conv_inputs.append(parents[-1].input[0]) + + qlinear_conv_output = child.output[0] + + kwargs = {} + for attribute in node.attribute: 
+ if attribute.name == "activation" and attribute.s in [b"Relu", b"Clip"]: # pragma: no cover + continue + if attribute.name == "activation_params": # pragma: no cover + continue + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + + qlinear_conv_node = onnx.helper.make_node( + "QLinearConv", qlinear_conv_inputs, [qlinear_conv_output], node.name, **kwargs + ) + self.quantizer.new_nodes.append(qlinear_conv_node) + self.quantizer.remove_nodes.extend(parents) + self.quantizer.remove_nodes.append(child) + self.quantizer.remove_nodes.append(node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/direct_q8.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/direct_q8.py new file mode 100644 index 000000000..77d09793b --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/direct_q8.py @@ -0,0 +1,78 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Direct8Bit Operator.""" + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry( + op_types="Reshape, Transpose, Squeeze, Unsqueeze, Flatten, Expand, Slice, " + "SpaceToDepth, DepthToSpace, Upsample, Tile, CenterCropPad", + mode=[constants.STATIC_QUANT], +) +class Direct8BitOperator(base_op.Operator): + """Direct8Bit Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(Direct8BitOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + if not self.quantizer.is_valid_quantize_weight(node.input[0]): + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(self.node, [0], direct_int8=True) + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(self.node, direct_int8=True) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if (len(children) == 0 and len(parents) == 0) or not node.name.endswith("_quant"): + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if any([i.op_type == "DequantizeLinear" for i in parents]) and any( + [i.op_type == "QuantizeLinear" for i in children] + ): + for parent in parents: + if parent.op_type == "DequantizeLinear": + # make sure parent DequantizeLinear of input 0 is not used by other ops + if len(self.quantizer.model.get_children(parent)) == 1 and not self.quantizer.model.is_graph_output( + parents[0].output[0] + ): + self.quantizer.remove_nodes.append(parent) + 
self.node.input[0] = parent.input[0] + break + for child in children: + if child.op_type == "QuantizeLinear": + self.quantizer.remove_nodes.append(child) + self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") + node.output[0] = node.output[0] + "_quantized" diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/embed_layernorm.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/embed_layernorm.py new file mode 100644 index 000000000..0b9967f3d --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/embed_layernorm.py @@ -0,0 +1,68 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""EmbedLayerNormalization Operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="EmbedLayerNormalization", mode=[constants.DYNAMIC_QUANT, constants.STATIC_QUANT]) +class EmbedLayerNormalizationOperator(base_op.Operator): + """EmbedLayerNormalization Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(EmbedLayerNormalizationOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, [2, 3, 4, 5, 6]) + node.name = node.name + "_quant" + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parents = [i for i in self.quantizer.model.get_parents(node) if i.op_type == "DequantizeLinear"] + inputs = [] + # 'input_ids' + inputs.extend([node.input[0]]) + # 'segment_ids' + inputs.extend([node.input[1]]) + for parent in parents: + inputs.append(parent.input[0]) + # 'mask' (optional) + if len(node.input) > 7: + inputs.append(node.input[7]) + + for parent in parents: + inputs.append(parent.input[1]) + for parent in parents: + inputs.append(parent.input[2]) + + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + kwargs["domain"] = quant_utils.ms_domain + + qembed_layer_norm_node = onnx.helper.make_node( + "QEmbedLayerNormalization", inputs, node.output, node.name, **kwargs + ) + self.quantizer.new_nodes.append(qembed_layer_norm_node) + self.quantizer.remove_nodes.extend(parents) + self.quantizer.remove_nodes.append(node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py new file mode 100644 index 000000000..e337125b2 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py @@ -0,0 +1,109 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Gather Operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry( + op_types="Gather, GatherElements, GatherND", mode=[constants.DYNAMIC_QUANT, constants.STATIC_QUANT] +) +class GatherOperator(base_op.Operator): + """Gather Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(GatherOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + if not self.quantizer.is_valid_quantize_weight(node.input[0]): + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, [0], initializer_use_weight_qType=False) + if not self.disable_qdq_for_node_output or self.quantizer.mode != constants.DYNAMIC_QUANT: + self.quantizer.quantize_outputs(node) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if len(children) == 0 or len(parents) == 0 or not node.name.endswith("_quant"): + return False + + return True + + def convert(self): + """Convert to QOperator format.""" + # DQ-Gather-Q-DQ-op + node = self.node + + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + + if any([i.op_type == "DequantizeLinear" for i in parents]): + + inputs = [] + inputs.append(parents[0].input[0]) + inputs.append(node.input[1]) + + out_scale = 1.0 + out_zp = 0 + gather_new_output = node.output[0] + "_quantized" # dynamic quant output name + for child in children: + if child.op_type == "QuantizeLinear": + out_scale = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(children[0].input[1])) + out_zp = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(children[0].input[2])) + gather_new_output = children[0].output[0] # static quant output name + self.quantizer.remove_nodes.append(child) + + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + + gather_node = onnx.helper.make_node(node.op_type, inputs, [gather_new_output], node.name, **kwargs) + self.quantizer.new_nodes.append(gather_node) + if any([i.op_type != "QuantizeLinear" for i in children]): + dq_inputs = [] + dq_inputs.append(gather_new_output) + dq_inputs.extend(parents[0].input[1:]) + dq_node = onnx.helper.make_node( + "DequantizeLinear", dq_inputs, [node.output[0]], node.name + "_DequantizeLinear" + ) + self.quantizer.new_nodes.append(dq_node) + + # int8 weight will be recalculated for the first time + if ( + any([child.op_type == "QuantizeLinear" for child in children]) + and self.quantizer.model.get_initializer(parents[0].input[0]) is not None + and parents[0].input[0] not in self.quantizer.recalculate_quantized_value + ): + 
int8_tensor = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[0])) + in_scale = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[1])) + in_zp = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[2])) + new_int8_tensor = (((int8_tensor.astype("float32") - in_zp) * in_scale) / out_scale).round() + out_zp + self.quantizer.model.set_initializer(parents[0].input[0], new_int8_tensor.astype(int8_tensor.dtype)) + self.quantizer.recalculate_quantized_value.append(parents[0].input[0]) + self.quantizer.remove_nodes.extend([node, parents[0]]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/gavgpool.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/gavgpool.py new file mode 100644 index 000000000..a91c1e531 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/gavgpool.py @@ -0,0 +1,59 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""GlobalAveragePool Operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="GlobalAveragePool", mode=[constants.STATIC_QUANT]) +class GlobalAveragePoolOperator(base_op.Operator): + """GlobalAveragePool Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(GlobalAveragePoolOperator, self).__init__(onnx_quantizer, onnx_node) + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + children = self.quantizer.model.get_children(node) + if len(children) == 0: # pragma: no cover + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parent = self.quantizer.model.get_parents(node)[0] + child = self.quantizer.model.get_children(node)[0] + + kwargs = {} + for attribute in node.attribute: + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + kwargs["domain"] = quant_utils.ms_domain + kwargs["channels_last"] = 0 + + inputs = parent.input + inputs.extend(child.input[1:]) + + qnode = onnx.helper.make_node("QLinear" + node.op_type, inputs, child.output, node.name + "_quant", **kwargs) + self.quantizer.new_nodes += [qnode] + self.quantizer.remove_nodes.append(child) + self.quantizer.remove_nodes.append(parent) + self.quantizer.remove_nodes.append(node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/gemm.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/gemm.py new file mode 100644 index 000000000..8d0b61c73 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/gemm.py @@ -0,0 +1,91 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with 
the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Gemm Operator.""" + +import onnx + +from onnx_neural_compressor import constants, logger +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="Gemm", mode=[constants.STATIC_QUANT]) +class GemmOperator(base_op.Operator): + """Gemm Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(GemmOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + if len(node.input) == 3 and not quant_utils.find_by_name(node.input[2], self.quantizer.model.initializer()): + + logger.warning( + "Bias of Gemm node '{}' is not constant. " + "Exclude this node can get better performance.".format(node.name) + ) + if self.quantizer.quant_format != "qdq": + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, [0]) + if self.per_channel and quant_utils.find_by_name(node.input[1], self.quantizer.model.initializer()): + self.quantizer.quantize_weights_per_channel( + node, [1], self.weight_dtype, self.weight_sym, 0 if quant_utils.is_B_transposed(node) else 1 + ) + else: + self.quantizer.quantize_inputs(node, [1]) + + if len(node.input) == 3 and quant_utils.find_by_name(node.input[2], self.quantizer.model.initializer()): + self.quantizer.quantize_bias_tensor(node) + beta_attribute = [attr for attr in node.attribute if attr.name == "beta"] + if len(beta_attribute): + beta_attribute[0].f = 1.0 + + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(node) + node.name = node.name + "_quant" + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parents = self.quantizer.model.get_parents(node) + qgemm_inputs = [] + for parent in parents[:-1]: + qgemm_inputs.extend(parent.input) + qgemm_inputs.append(parents[-1].input[0]) + + kwargs = {} + for attribute in node.attribute: + if attribute.name != "beta": + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + kwargs["domain"] = quant_utils.ms_domain + + qgemm_output = node.output[0] + if not self.disable_qdq_for_node_output: + child = self.quantizer.model.get_children(node)[0] + self.quantizer.remove_nodes.append(child) + qgemm_output = child.output[0] + qgemm_inputs.extend(child.input[1:]) + qgemm_node = onnx.helper.make_node("QGemm", qgemm_inputs, [qgemm_output], node.name, **kwargs) + + self.quantizer.new_nodes.append(qgemm_node) + self.quantizer.remove_nodes.extend(parents) + self.quantizer.remove_nodes.append(node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/lstm.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/lstm.py new file mode 100644 index 000000000..8499f2441 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/lstm.py @@ -0,0 +1,138 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this 
file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""LSTM Operator.""" + +import numpy +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="LSTM", mode=[constants.DYNAMIC_QUANT]) +class LSTMOperator(base_op.Operator): + """LSTM Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(LSTMOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize(self): + """Do quantizaion.""" + return + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + + if not self.quantizer.is_valid_quantize_weight(node.input[1]) or not self.quantizer.is_valid_quantize_weight( + node.input[2] + ): # pragma: no cover + return False + + model = self.quantizer.model + W = model.get_initializer(node.input[1]) + R = model.get_initializer(node.input[2]) + + if len(W.dims) != 3 or len(R.dims) != 3: # pragma: no cover + return False + + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + model = self.quantizer.model + W = model.get_initializer(self.node.input[1]) + R = model.get_initializer(self.node.input[2]) + + [W_num_dir, W_4_hidden_size, W_input_size] = W.dims + [R_num_dir, R_4_hidden_size, R_hidden_size] = R.dims + + if self.per_channel: # pragma: no cover + del W.dims[0] + del R.dims[0] + W.dims[0] = W_num_dir * W_4_hidden_size + R.dims[0] = R_num_dir * R_4_hidden_size + + quant_input_weight_tuple = self.quantizer.quantize_weight_per_channel( + node.input[1], self.weight_dtype, self.weight_sym, 0 + ) + quant_recurrent_weight_tuple = self.quantizer.quantize_weight_per_channel( + node.input[2], self.weight_dtype, self.weight_sym, 0 + ) + + W_quant_weight = model.get_initializer(quant_input_weight_tuple[0]) + R_quant_weight = model.get_initializer(quant_recurrent_weight_tuple[0]) + + W_quant_array = onnx.numpy_helper.to_array(W_quant_weight) + R_quant_array = onnx.numpy_helper.to_array(R_quant_weight) + + W_quant_array = numpy.reshape(W_quant_array, (W_num_dir, W_4_hidden_size, W_input_size)) + R_quant_array = numpy.reshape(R_quant_array, (R_num_dir, R_4_hidden_size, R_hidden_size)) + + W_quant_array = numpy.transpose(W_quant_array, (0, 2, 1)) + R_quant_array = numpy.transpose(R_quant_array, (0, 2, 1)) + + W_quant_tranposed = onnx.numpy_helper.from_array(W_quant_array, quant_input_weight_tuple[0]) + R_quant_tranposed = onnx.numpy_helper.from_array(R_quant_array, quant_recurrent_weight_tuple[0]) + + model.remove_initializers([W_quant_weight, R_quant_weight]) + model.add_initializer(W_quant_tranposed) + model.add_initializer(R_quant_tranposed) + + W_quant_zp = model.get_initializer(quant_input_weight_tuple[1]) + R_quant_zp = model.get_initializer(quant_recurrent_weight_tuple[1]) + W_quant_scale = model.get_initializer(quant_input_weight_tuple[2]) + R_quant_scale = model.get_initializer(quant_recurrent_weight_tuple[2]) + + if self.per_channel: # pragma: no cover + W_quant_zp.dims[:] = 
[W_num_dir, W_4_hidden_size] + R_quant_zp.dims[:] = [R_num_dir, R_4_hidden_size] + W_quant_scale.dims[:] = [W_num_dir, W_4_hidden_size] + R_quant_scale.dims[:] = [R_num_dir, R_4_hidden_size] + + inputs = [] + input_len = len(node.input) + inputs.extend([node.input[0]]) + inputs.extend([quant_input_weight_tuple[0], quant_recurrent_weight_tuple[0]]) + inputs.extend([node.input[3] if input_len > 3 else ""]) + inputs.extend([node.input[4] if input_len > 4 else ""]) + inputs.extend([node.input[5] if input_len > 5 else ""]) + inputs.extend([node.input[6] if input_len > 6 else ""]) + inputs.extend([node.input[7] if input_len > 7 else ""]) + inputs.extend( + [ + quant_input_weight_tuple[2], + quant_input_weight_tuple[1], + quant_recurrent_weight_tuple[2], + quant_recurrent_weight_tuple[1], + ] + ) + + kwargs = {} + for attribute in node.attribute: + if attribute.name == "layout": + continue + kwarg = quant_utils.attribute_to_kwarg(attribute) + kwargs.update(kwarg) + + quant_lstm_name = node.name + "_quant" + quant_lstm_node = onnx.helper.make_node( + "DynamicQuantizeLSTM", inputs, node.output, quant_lstm_name, domain="com.microsoft", **kwargs + ) + self.quantizer.remove_nodes.append(node) + self.quantizer.new_nodes.append(quant_lstm_node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/matmul.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/matmul.py new file mode 100644 index 000000000..eff98f533 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/matmul.py @@ -0,0 +1,168 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
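+# In outline, the dynamic variant below rewrites a float MatMul into
+#     MatMulInteger -> Cast(int32 to float) -> Mul(product of the two input scales)
+# so the integer accumulator is rescaled back to float, while the static variant
+# folds the surrounding Q/DQ pairs into a single QLinearMatMul (or
+# MatMulIntegerToFloat when output quantization is disabled).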
+"""MatMul Operator.""" + +import onnx +from onnx import onnx_pb as onnx_proto + +from onnx_neural_compressor import constants +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="MatMul", mode=[constants.DYNAMIC_QUANT]) +class MatMulOperator(base_op.Operator): + """MatMul Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(MatMulOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + if not all([self.quantizer.model.get_initializer(i) is None for i in node.input]): + return True + elif all([i not in self.quantizer.quantized_value_map for i in node.input]): + return False + else: + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, [0]) + if self.per_channel and quant_utils.find_by_name(node.input[1], self.quantizer.model.initializer()): + self.quantizer.quantize_weights_per_channel(node, [1], self.weight_dtype, self.weight_sym, 1) + else: + self.quantizer.quantize_inputs(node, [1]) + + node.name = node.name + "_quant" + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parents = self.quantizer.model.get_parents(node) + + inputs = [] + quantized_name = [] + scale = [] + zp = [] + for parent in parents: + if parent.op_type == "DequantizeLinear": + quantized_name.append(parent.input[0]) + else: + quantized_name.append(parent.output[0]) + if parent.op_type == "DynamicQuantizeLinear": + scale.append(parent.output[1]) + zp.append(parent.output[2]) + else: + scale.append(parent.input[1]) + zp.append(parent.input[2]) + inputs.extend(quantized_name) + inputs.extend(zp) + matmul_integer_output = node.output[0] + "_output_quantized" + matmul_integer_node = onnx.helper.make_node("MatMulInteger", inputs, [matmul_integer_output], node.name) + self.quantizer.new_nodes.append(matmul_integer_node) + + # Add cast operation to cast matmulInteger output to float. + cast_op_output = matmul_integer_output + "_cast_output" + cast_node = onnx.helper.make_node( + "Cast", + [matmul_integer_output], + [cast_op_output], + matmul_integer_output + "_cast", + to=onnx_proto.TensorProto.FLOAT, + ) + self.quantizer.new_nodes.append(cast_node) + + # Add mul operation to multiply scales of two inputs. + scales_mul_op = node.name + "_scales_mul" + + scales_mul_node = quant_utils.find_by_name(scales_mul_op, self.quantizer.new_nodes) + if scales_mul_node is None: + scales_mul_node = onnx.helper.make_node("Mul", [scale[0], scale[1]], [scales_mul_op + ":0"], scales_mul_op) + self.quantizer.new_nodes.append(scales_mul_node) + + scales_mul_op_output = scales_mul_node.output[0] + + # Add mul operation to multiply mul_scales_op result with output of MatMulInteger + # and make the output of this node the same as output of original matmul node. 
+ output_scale_mul_op = node.name + "_output_scale_mul" + self.quantizer.new_nodes.append( + onnx.helper.make_node("Mul", [cast_op_output, scales_mul_op_output], [node.output[0]], output_scale_mul_op) + ) + if parents[1].op_type == "DequantizeLinear": + self.quantizer.remove_nodes.append(parents[1]) + self.quantizer.remove_nodes.append(node) + + +@base_op.op_registry(op_types="MatMul", mode=[constants.STATIC_QUANT]) +class StaticMatMulOperator(MatMulOperator): + """MatMul Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(MatMulOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, [0]) + if self.per_channel and quant_utils.find_by_name(node.input[1], self.quantizer.model.initializer()): + self.quantizer.quantize_weights_per_channel(node, [1], self.weight_dtype, self.weight_sym, 1) + else: + self.quantizer.quantize_inputs(node, [1]) + + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(node) + node.name = node.name + "_quant" + + def convert(self): + """Convert to QOperator format.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + if len(self.quantizer.model.get_children(node)) == 0 or not node.name.endswith("_quant"): # pragma: no cover + return + + qlinear_matmul_inputs = [] + if self.disable_qdq_for_node_output: + for i in range(len(parents[0].input)): + qlinear_matmul_inputs.extend([parent.input[i] for parent in parents]) + qlinear_matmul_node = onnx.helper.make_node( + "MatMulIntegerToFloat", qlinear_matmul_inputs, node.output, node.name, domain="com.microsoft" + ) + else: + # after inserting QDQ, MatMul -> Q-DQ-MatMul-Q-DQ + for parent in parents: + qlinear_matmul_inputs.extend(parent.input) + + child = self.quantizer.model.get_children(node)[0] + qlinear_matmul_output = child.output[0] + qlinear_matmul_inputs.extend(child.input[1:]) + qlinear_matmul_node = onnx.helper.make_node( + "QLinearMatMul", qlinear_matmul_inputs, [qlinear_matmul_output], node.name + ) + self.quantizer.remove_nodes.append(child) + self.quantizer.new_nodes.append(qlinear_matmul_node) + self.quantizer.remove_nodes.append(node) + + # make sure parent DequantizeLinear of input 0 is not used by other ops + if len(self.quantizer.model.get_children(parents[0])) == 1 and not self.quantizer.model.is_graph_output( + parents[0].output[0] + ): + self.quantizer.remove_nodes.extend(parents) + else: + self.quantizer.remove_nodes.append(parents[1]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/maxpool.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/maxpool.py new file mode 100644 index 000000000..cd5119c13 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/maxpool.py @@ -0,0 +1,74 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
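+# MaxPool is quantized as a "direct int8" pass-through: its output reuses the
+# input's scale and zero point, and since 8-bit MaxPool requires opset 12 the
+# check below skips older opsets.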
+"""MaxPool Operator.""" + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="MaxPool", mode=[constants.STATIC_QUANT]) +class MaxPoolOperator(base_op.Operator): + """MaxPool Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(MaxPoolOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + # if opset version is less than 12, just no change + if self.quantizer.opset_version < 12: # pragma: no cover + return False + + if not self.quantizer.is_valid_quantize_weight(node.input[0]): # pragma: no cover + return False + + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(self.node, direct_int8=True) + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(self.node, direct_int8=True) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + children = self.quantizer.model.get_children(node) + if len(children) == 0 or not node.name.endswith("_quant"): # pragma: no cover + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + parent = self.quantizer.model.get_parents(node)[0] + children = self.quantizer.model.get_children(node) + if parent.op_type != "DequantizeLinear" or all( + [i.op_type != "QuantizeLinear" for i in children] + ): # pragma: no cover + return + node.input[0] = parent.input[0] + node.output[0] = node.output[0].replace("_QuantizeInput", "_quantized") + for child in children: + if child.op_type == "QuantizeLinear": + self.quantizer.remove_nodes.append(child) + for n in self.quantizer.model.get_children(child): + self.quantizer.model.replace_node_input(n, child.output[0], node.output[0]) + + self.quantizer.remove_nodes.append(parent) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py new file mode 100644 index 000000000..6ffe742b5 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py @@ -0,0 +1,102 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Pad Operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="Pad", mode=[constants.STATIC_QUANT]) +class PadOperator(base_op.Operator): + """Pad Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(PadOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + # if opset version is less than 11, just no change + if self.quantizer.opset_version < 11: # pragma: no cover + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, [0]) + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(node) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + children = self.quantizer.model.get_children(node) + if len(children) == 0 or not node.name.endswith("_quant"): # pragma: no cover + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parent = self.quantizer.model.get_parents(node)[0] + child = self.quantizer.model.get_children(node)[0] + + kwargs = {} + for attribute in node.attribute: + kv = quant_utils.attribute_to_kwarg(attribute) + kwargs.update(kv) + + if "mode" not in kwargs or kwargs["mode"] == b"constant": + if len(node.input) > 2: # There is 3rd input 'constant_value' + zp_tensor = self.quantizer.model.get_initializer(parent.input[2]) + scale_tensor = self.quantizer.model.get_initializer(parent.input[1]) + + padding_constant_initializer = self.quantizer.model.get_initializer(node.input[2]) + if padding_constant_initializer is not None: + zp_array = onnx.numpy_helper.to_array(zp_tensor) + zp_value = zp_array.item() if zp_array.ndim == 0 else zp_array[0] + scale_array = onnx.numpy_helper.to_array(scale_tensor) + scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0] + padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer) + quantized_padding_constant_array = quant_utils.quantize_nparray( + onnx.helper.tensor_dtype_to_np_dtype(self.weight_dtype), + padding_constant_array, + scale_value, + zp_value, + ) + quantized_padding_constant_name = node.input[2] + "_quantized" + quantized_padding_constant_initializer = onnx.numpy_helper.from_array( + quantized_padding_constant_array, quantized_padding_constant_name + ) + # Suppose this padding constant initializer only used by the node + self.quantizer.model.remove_initializer(padding_constant_initializer) + self.quantizer.model.add_initializer(quantized_padding_constant_initializer) + node.input[2] = quantized_padding_constant_name + else: + self.quantizer.quantize_inputs(node, [2], False) + node.input[2] = node.input[2] + "_DequantizeLinear" + else: + # pad zero_point for original zero + node.input.extend([parent.input[2]]) + + # Create an entry for output quantized value + node.input[0] = parent.input[0] + node.output[0] = child.output[0] + self.quantizer.remove_nodes.extend([parent, child]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/pooling.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/pooling.py new file mode 100644 index 000000000..fb97ce630 --- /dev/null +++ 
b/onnx_neural_compressor/algorithms/post_training_quant/operators/pooling.py @@ -0,0 +1,81 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""AveragePool Operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="AveragePool", mode=[constants.STATIC_QUANT]) +class PoolOperator(base_op.Operator): + """AveragePool Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(PoolOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + if not self.quantizer.is_valid_quantize_weight(node.input[0]): + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + super().quantize() + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + + if len(children) == 0 or len(parents) == 0 or not node.name.endswith("_quant"): + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + + if all([i.op_type == "DequantizeLinear" for i in parents]) and any( + [i.op_type == "QuantizeLinear" for i in children] + ): + qlinear_output_name = node.output[0] + "_quantized" + inputs = [] + inputs.extend(parents[0].input) + inputs.extend([i for i in children if i.op_type == "QuantizeLinear"][0].input[1:]) + kwargs = {} + for attribute in node.attribute: + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + kwargs["domain"] = quant_utils.ms_domain + qnode = onnx.helper.make_node("QLinear" + node.op_type, inputs, [qlinear_output_name], node.name, **kwargs) + + self.quantizer.remove_nodes.extend(parents) + for child in children: + if child.op_type == "QuantizeLinear": + self.quantizer.remove_nodes.append(child) + self.quantizer.model.replace_input_of_all_nodes(child.output[0], qnode.output[0]) + + self.quantizer.new_nodes.append(qnode) + self.quantizer.remove_nodes.append(node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/reduce.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/reduce.py new file mode 100644 index 000000000..f89000e2e --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/reduce.py @@ -0,0 +1,83 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Reduce Operator.""" + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry( + op_types="ReduceMean, ReduceLogSum, ReduceLogSumExp, " "ReduceL1, ReduceL2, ReduceProd, ReduceSum, ReduceSumSquare", + mode=[constants.STATIC_QUANT], +) +class ReduceOperator(base_op.Operator): + """Reduce Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(ReduceOperator, self).__init__(onnx_quantizer, onnx_node) + + +@base_op.op_registry(op_types="ReduceMax, ReduceMin", mode=[constants.STATIC_QUANT]) +class ReduceMinMaxOperator(base_op.Operator): + """ReduceMin and ReduceMax Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(ReduceMinMaxOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + if not self.quantizer.is_valid_quantize_weight(node.input[0]): + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(self.node, [0], direct_int8=True) + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(self.node, direct_int8=True) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if (len(children) == 0 and len(parents) == 0) or not node.name.endswith("_quant"): + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if any([i.op_type == "DequantizeLinear" for i in parents]) and any( + [i.op_type == "QuantizeLinear" for i in children] + ): + for parent in parents: + if parent.op_type == "DequantizeLinear": + self.node.input[0] = parent.input[0] + self.quantizer.remove_nodes.append(parents[0]) + break + for child in children: + if child.op_type == "QuantizeLinear": + self.quantizer.remove_nodes.append(child) + self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") + node.output[0] = node.output[0] + "_quantized" diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/resize.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/resize.py new file mode 100644 index 000000000..0cba83441 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/resize.py @@ -0,0 +1,75 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Resize Operator.""" + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="Resize", mode=[constants.STATIC_QUANT]) +class ResizeOperator(base_op.Operator): + """Resize Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(ResizeOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + # if version is less than 11, just keep this node + if self.quantizer.opset_version < 11: + return False + if not self.quantizer.is_valid_quantize_weight(node.input[0]): + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, [0], direct_int8=True) + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(self.node, direct_int8=True) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if (len(children) == 0 and len(parents) == 0) or not node.name.endswith("_quant"): + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + + if any([i.op_type == "DequantizeLinear" for i in parents]) and any( + [i.op_type == "QuantizeLinear" for i in children] + ): + for parent in parents: + if parent.op_type == "DequantizeLinear" and parent.output[0] == node.input[0]: + self.node.input[0] = parent.input[0] + self.quantizer.remove_nodes.append(parent) + break + for child in children: + if child.op_type == "QuantizeLinear": + self.quantizer.remove_nodes.append(child) + self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") + node.output[0] = node.output[0] + "_quantized" diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/split.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/split.py new file mode 100644 index 000000000..3192b51d1 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/split.py @@ -0,0 +1,88 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
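+# Split only re-partitions data, so every output inherits the input's
+# quantization parameters; convert() then feeds the already-quantized tensor
+# straight into Split and drops the surrounding Q/DQ nodes.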
+"""Split Operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="Split", mode=[constants.STATIC_QUANT]) +class SplitOperator(base_op.Operator): + """Split Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(SplitOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + data_found, _, _, _, _ = self.quantizer._get_quantization_params(node.output[0]) + if not data_found: + return False + if not all([self.quantizer.is_valid_quantize_weight(i) for i in node.input]): + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, [0]) + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(self.node, direct_int8=True) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + parent = self.quantizer.model.get_parents(node)[0] + children = self.quantizer.model.get_children(node) + if ( + parent.op_type != "DequantizeLinear" or len(children) == 0 or not node.name.endswith("_quant") + ): # pragma: no cover + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parent = self.quantizer.model.get_parents(node)[0] + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + + quantized_input_names = [] + quantized_input_names.append(parent.input[0]) + if len(node.input) > 1: # pragma: no cover + quantized_input_names.extend(node.input[1:]) + outputs = [] + input_name_to_nodes = self.quantizer.model.input_name_to_nodes() + for output in node.output: + if output in input_name_to_nodes: + child = input_name_to_nodes[output][0] + if child.op_type == "QuantizeLinear": + self.quantizer.remove_nodes.append(child) + outputs.append(child.output[0]) + else: # pragma: no cover + outputs.append(output) + else: # pragma: no cover + outputs.append(output + "_quantized") + + quantized_node = onnx.helper.make_node(node.op_type, quantized_input_names, outputs, node.name, **kwargs) + self.quantizer.new_nodes.append(quantized_node) + self.quantizer.remove_nodes.extend([parent, node]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/unary_op.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/unary_op.py new file mode 100644 index 000000000..87c402b99 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/unary_op.py @@ -0,0 +1,80 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Unary operator.""" + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="Exp, Log, Round, Sqrt", mode=[constants.STATIC_QUANT]) +class UnaryOperator(base_op.Operator): + """Unary operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(UnaryOperator, self).__init__(onnx_quantizer, onnx_node) + + +@base_op.op_registry(op_types="Abs, Shrink, Sign", mode=[constants.STATIC_QUANT]) +class UnaryDirect8BitOperator(base_op.Operator): + """Unary operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(UnaryDirect8BitOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + if not self.quantizer.is_valid_quantize_weight(node.input[0]): + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(self.node, [0], direct_int8=True) + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(self.node, direct_int8=True) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if (len(children) == 0 and len(parents) == 0) or not node.name.endswith("_quant"): + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if any([i.op_type == "DequantizeLinear" for i in parents]) and any( + [i.op_type == "QuantizeLinear" for i in children] + ): + for parent in parents: + if parent.op_type == "DequantizeLinear": + self.node.input[0] = parent.input[0] + self.quantizer.remove_nodes.append(parents[0]) + break + for child in children: + if child.op_type == "QuantizeLinear": + self.quantizer.remove_nodes.append(child) + self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") + node.output[0] = node.output[0] + "_quantized" diff --git a/onnx_neural_compressor/algorithms/post_training_quant/quantizer.py b/onnx_neural_compressor/algorithms/post_training_quant/quantizer.py new file mode 100644 index 000000000..8fb49d2d0 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/quantizer.py @@ -0,0 +1,1246 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Quantizer for onnx models.""" + +import copy +import logging +import os + +import numpy as np +import onnx +import onnxruntime as ort + +from onnx_neural_compressor import logger, onnx_model, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +class Quantizer: + """Quantizer class.""" + + def __init__( + self, + model, + q_config, + mode, + static, + quantization_params, + op_types_to_quantize, + fallback_list=["fp32"], + reduce_range=None, + add_qdq_pair_to_weight=False, + optypes_to_exclude_output_quant=[], + dedicated_qdq_pair=False, + execution_provider="CPUExecutionProvider", + ): + """Initialization. + + Args: + model (ModelProto or onnx_model.ONNXModel): onnx model or onnx model wrapper by neural compressor + q_config (dict): op-wise quantization config. + mode (str): quantizaion mode + static (bool): static or not + quantization_params (dict): scale and zero point of tensors + op_types_to_quantize (list): optypes to quantize + fallback_list (list, optional): fallback data type. Defaults to ['fp32']. + reduce_range (bool, optional): use 7 bit or not. Defaults to None. + add_qdq_pair_to_weight (bool, optional): add QDQ pair to weight or not. Defaults to False. + optypes_to_exclude_output_quant (list, optional): optypes to exclude output quantization. Defaults to []. + dedicated_qdq_pair (bool, optional): dedicate QDQ pair or not. Defaults to False. + execution_provider (str, optional): execution_provider of onnxrt adaptor. Defaults to CPUExecutionProvider + """ + self.model = onnx_model.ONNXModel(model) if not isinstance(model, onnx_model.ONNXModel) else model + model = ( + onnx.shape_inference.infer_shapes(self.model.model) if not self.model.is_large_model else self.model.model + ) + self.config = q_config + self.execution_provider = execution_provider + self.reduce_range = reduce_range + self.mode = mode + self.quant_format = None + self.static = static # use static quantization for inputs. + self.fuse_dynamic_quant = False + self.quantization_params = quantization_params + self.op_types_to_quantize = op_types_to_quantize + self.fallback_list = fallback_list + self.new_nodes = [] + + self.opset_version = self.check_opset_version() + self.value_infos = {vi.name: vi for vi in model.graph.value_info} + self.value_infos.update({ot.name: ot for ot in model.graph.output}) + self.value_infos.update({it.name: it for it in model.graph.input}) + self.replace_input = [] + self.remove_nodes = [] + # List of quantized weights + self.quantized_value_map = {} + self.new_value_info = {} + + # List of recalculated quantize weight for Gather op. + self.recalculate_quantized_value = [] + + # QuantizeRange tensor name and zero tensor name for scale and zero point calculation. 
+ # Used when static is False + self.fixed_qrange_uint8_name = "fixed_quantization_range_uint8" + self.fixed_qrange_int8_name = "fixed_quantization_range_int8" + # For uint8 data-type, to compute zero point, we subtract rmin from 0 (represented by fixed_zero_name tensor) + self.fixed_zero_name = "fixed_zero" + # For int8 data-type, zero point is always zero (represented by fixed_zero_point_name tensor) + self.fixed_zero_zp_name = "fixed_zero_zp" + + if not self.static: + self.optypes_to_exclude_output_quant = op_types_to_quantize + else: + self.optypes_to_exclude_output_quant = optypes_to_exclude_output_quant + + self.add_qdq_pair_to_weight = add_qdq_pair_to_weight + self.dedicated_qdq_pair = dedicated_qdq_pair + + def check_opset_version(self): + """Check opset version.""" + ai_onnx_domain = [ + opset for opset in self.model.model.opset_import if not opset.domain or opset.domain == "ai.onnx" + ] + if 1 != len(ai_onnx_domain): + raise ValueError("Failed to find proper ai.onnx domain") + opset_version = ai_onnx_domain[0].version + + if opset_version > 10: + self.fuse_dynamic_quant = True + elif opset_version < 10: + logger.warning( + f"Warning: The original model opset version is {opset_version}, which does not support node " + + "fusions. Please update the model to opset >= 11 for better performance." + ) + self.model.model.opset_import.remove(ai_onnx_domain[0]) + self.model.model.opset_import.extend([onnx.helper.make_opsetid("", 11)]) + opset_version = 11 + + return opset_version + + def should_quantize(self, node): + """Check if node should be quantized.""" + if node.name in self.config and self.config[node.name] not in self.fallback_list: + return True + elif ( + quant_utils.get_node_original_name(node) in self.config + and self.config[quant_utils.get_node_original_name(node)] not in self.fallback_list + ): + return True + else: + return False + + def should_convert(self, node): + """Check if node should be converted.""" + name = quant_utils.get_node_original_name(node) + if name in self.config and self.config[name] not in self.fallback_list: + return True + else: + return False + + def _postprocess(self): + if "TensorrtExecutionProvider" in self.execution_provider: + utility.trt_env_setup(self.model.model) + self.merge_dedicated_qdq_pair() + self.model.remove_unused_nodes() + + self.model.model.producer_name = quant_utils.__producer__ + self.model.model.producer_version = quant_utils.__version__ + + def _preprocess(self): + quant_utils.remove_init_from_model_input(self.model) + quant_utils.split_shared_bias(self.model) + + def quantize_model(self): + """Quantize onnx model.""" + self._preprocess() + + # step 1: insert q-dq pairs + self.insert_qdq() + + self.remove_duplicate_qdq_paris() + + # step 2: convert q-node-dq to qoperator format if needed + if self.quant_format != "qdq": + self.convert_qdq_to_operator_oriented() + + self._postprocess() + quant_utils.dump_model_op_stats(self.model.model, self.config, self.op_types_to_quantize) + return self.model.model + + def merge_dedicated_qdq_pair(self): + """Merge dedicated Q/DQ pairs.""" + self.remove_nodes = [] + self.replace_input = [] + self.new_nodes = [] + if self.quant_format == "qdq" and self.dedicated_qdq_pair: + # node node + # | / \ + # q -> q q + # / \ / \ + # dq dq dq dq + for node in self.model.nodes(): + if node.op_type in ["QuantizeLinear"]: + children = self.model.get_children(node) + if len([i for i in children if i.op_type in ["DequantizeLinear"]]) < 2: + continue + for idx, child in enumerate(children): + if 
child.op_type not in ["DequantizeLinear"]: + continue + if self.should_quantize(self.model.get_children(child)[0]): + inputs = [self.model.get_parents(node)[0].output[0], node.input[1], node.input[2]] + self.new_nodes.append( + onnx.helper.make_node( + "QuantizeLinear", + inputs, + [node.output[0] + "_" + str(idx)], + node.name + "_" + str(idx), + ) + ) + self.replace_input.append([child, node.output[0], node.output[0] + "_" + str(idx)]) + else: + self.remove_nodes.append(child) + self.replace_input.append( + [self.model.get_children(child)[0], child.output[0], node.input[0]] + ) + self.remove_nodes.append(node) + self.model.remove_nodes(self.remove_nodes) + self.model.graph().node.extend(self.new_nodes) + for node, old_input_name, new_input_name in self.replace_input: + self.model.replace_node_input(node, old_input_name, new_input_name) + self.model.update() + + elif self.quant_format != "qdq" or not self.dedicated_qdq_pair: + # node node + # / \ -> | + # q(dq) q(dq) q(dq) + target_type = ["QuantizeLinear", "DequantizeLinear"] + for op_type in target_type: + for node in self.model.nodes(): + children = self.model.get_children(node) + dq_nodes = [i for i in children if i.op_type == op_type] + if len(dq_nodes) < 2 or node.op_type in ["Split"]: + continue + datas = [] + for n in dq_nodes: + datas.append( + [ + onnx.numpy_helper.to_array( + quant_utils.find_by_name(n.input[1], self.model.initializer()) + ), + onnx.numpy_helper.to_array( + quant_utils.find_by_name(n.input[2], self.model.initializer()) + ), + ] + ) + for idx, data in enumerate(datas): + repeaded_id = [i for i, item in enumerate(datas[idx:]) if item == data] + for i in repeaded_id[1:]: + self.remove_nodes.append(dq_nodes[i]) + self.replace_input.append( + [ + self.model.get_children(dq_nodes[i])[0], + dq_nodes[i].output[0], + dq_nodes[idx].output[0], + ] + ) + self.model.remove_nodes(self.remove_nodes) + self.model.graph().node.extend(self.new_nodes) + for node, old_input_name, new_input_name in self.replace_input: + self.model.replace_node_input(node, old_input_name, new_input_name) + self.model.update() + + def remove_duplicate_qdq_paris(self): + """Remove duplicated qdq pairs.""" + self.remove_nodes = [] + for node in self.model.nodes(): + if node.op_type == "DequantizeLinear": + matched_parents = self.model.match_parent_path( + node, + ["QuantizeLinear", "DequantizeLinear", "QuantizeLinear"], + [None, None, None], + ) + + if matched_parents is not None: + # (node) DQ - (matched_parents) Q-DQ-Q + if all( + [i.op_type == "QuantizeLinear" for i in self.model.get_children(matched_parents[1])] + ) and not self.model.is_graph_output(matched_parents[1].output[0]): + self.remove_nodes.append(matched_parents[1]) + if all([i.op_type == "DequantizeLinear" for i in self.model.get_children(matched_parents[0])]): + self.remove_nodes.append(matched_parents[0]) + self.replace_input.append([node, node.input[0], matched_parents[2].output[0]]) + + self.model.remove_nodes(self.remove_nodes) + for node, old_input_name, new_input_name in self.replace_input: + self.model.replace_node_input(node, old_input_name, new_input_name) + + def insert_qdq(self): + """Insert Q/DQ pairs.""" + for node in self.model.nodes(): + if self.should_quantize(node): + op_quantizer = base_op.OPERATORS[self.mode][node.op_type](self, node) + if op_quantizer.quantize_check(): + op_quantizer.quantize() + self.model.graph().node.extend(self.new_nodes) + self.model.remove_nodes(self.remove_nodes) + + for node, old_input_name, new_input_name in self.replace_input: + 
self.model.replace_node_input(node, old_input_name, new_input_name) + self.model.update() + + def convert_qdq_to_operator_oriented(self): + """Convert QDQ to QOperator format.""" + self.new_nodes = [] + self.remove_nodes = [] + self.replace_input = [] + for node in self.model.nodes(): + if node.op_type not in ["QuantizeLinear", "DequantizeLinear"] and self.should_convert(node): + op_converter = base_op.OPERATORS[self.mode][node.op_type](self, node) + if op_converter.convert_check(): + op_converter.convert() + self.model.graph().node.extend(self.new_nodes) + self.model.remove_nodes(self.remove_nodes) + for node, old_input_name, new_input_name in self.replace_input: + self.model.replace_node_input(node, old_input_name, new_input_name) + self.model.update() + + def quantize_bias_tensor(self, node): + """Quantize bias.""" + input_name, weight_name, bias_name = node.input + if ( + self.quantization_params is None + or input_name not in self.quantization_params + or input_name not in self.quantized_value_map + or ( + input_name in self.quantized_value_map + and quant_utils.find_by_name(self.quantized_value_map[input_name].scale_name, self.model.initializer()) + is None + ) + ): + self._dynamic_quantize_bias(input_name, weight_name + "_scale", bias_name, bias_name + "_quantized") + else: + beta = 1.0 + if node.op_type in ["Gemm"]: + beta_attribute = [attr for attr in node.attribute if attr.name == "beta"] + if len(beta_attribute): + beta = onnx.helper.get_attribute_value(beta_attribute[0]) + _, quant_value = self.quantize_bias(bias_name, input_name, weight_name, beta) + if self.model.get_initializer_share_num(bias_name) == 1: + self.model.remove_initializer(quant_utils.find_by_name(bias_name, self.model.initializer())) + inputs = [quant_value.q_name, quant_value.scale_name, quant_value.zp_name] + axis = None + if quant_utils.find_by_name(weight_name + "_DequantizeLinear", self.new_nodes): + dq_node = quant_utils.find_by_name(weight_name + "_DequantizeLinear", self.new_nodes) + if dq_node.op_type == "DequantizeLinear" and quant_utils.find_by_name("axis", dq_node.attribute): + axis = quant_utils.find_by_name("axis", dq_node.attribute).i + dequant_node = onnx.helper.make_node( + "DequantizeLinear", + inputs, + [bias_name + "_dequantized"], + bias_name + "_DequantizeLinear", + axis=axis, + ) + self.new_nodes.append(dequant_node) + self.replace_input.append( + [quant_utils.find_by_name(node.name, self.model.nodes()), bias_name, bias_name + "_dequantized"] + ) + + def quantize_bias(self, bias_name, input_name, weight_name, beta=1.0): + """Quantized the bias. 
+
+        Zero Point == 0 and Scale == Input_Scale * Weight_Scale
+        """
+        # get scale for weight
+        weight_scale_initializer = quant_utils.find_by_name(weight_name + "_scale", self.model.initializer())
+        weight_scale = (
+            self.tensor_proto_to_array(weight_scale_initializer, os.path.dirname(self.model.model_path))
+            if self.model.model_path is not None
+            else self.tensor_proto_to_array(weight_scale_initializer)
+        )
+
+        # get bias
+        bias_initializer = quant_utils.find_by_name(bias_name, self.model.initializer())
+        bias_data = (
+            self.tensor_proto_to_array(bias_initializer, os.path.dirname(self.model.model_path))
+            if self.model.model_path is not None
+            else self.tensor_proto_to_array(bias_initializer)
+        )
+        quantized_bias_name = bias_name + "_quantized"
+
+        if input_name in self.quantized_value_map:
+            input_scale_name = self.quantized_value_map[input_name].scale_name
+        elif input_name in self.quantization_params:
+            _, input_scale_name, _, _, _ = self._get_quantization_params(input_name)
+        else:
+            raise ValueError(f"Expected {input_name} to be in quantized value map for static quantization")
+        inputscale_initializer = quant_utils.find_by_name(input_scale_name, self.model.initializer())
+        input_scale = (
+            self.tensor_proto_to_array(inputscale_initializer, os.path.dirname(self.model.model_path))
+            if self.model.model_path is not None
+            else self.tensor_proto_to_array(inputscale_initializer)
+        )
+
+        # calculate scale for bias
+
+        bias_scale = input_scale * weight_scale * beta
+
+        # quantize bias
+        quantized_data = (np.asarray(bias_data) / bias_scale).round().astype(np.int32)
+
+        # update bias initializer
+        bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(bias_initializer.dims)
+        packed_bias_initializer = onnx.numpy_helper.from_array(bias_np_data, quantized_bias_name)
+        self.model.initializer().extend([packed_bias_initializer])
+
+        # update scale initializer
+        quantized_bias_scale_name = bias_name + "_scale"
+        bias_scale_data = np.asarray(bias_scale, dtype=np.float32).reshape(-1)
+        packed_bias_scale_initializer = onnx.numpy_helper.from_array(bias_scale_data, quantized_bias_scale_name)
+        self.model.initializer().extend([packed_bias_scale_initializer])
+
+        # update zero initializer
+        quantized_bias_zp_name = bias_name + "_zero_point"
+        bias_zp_data = np.zeros(bias_scale.shape, dtype=np.int32).reshape(-1)
+        packed_bias_zp_initializer = onnx.numpy_helper.from_array(bias_zp_data, quantized_bias_zp_name)
+        self.model.initializer().extend([packed_bias_zp_initializer])
+
+        quantized_value = quant_utils.QuantizedValue(
+            bias_name,
+            quantized_bias_name,
+            quantized_bias_scale_name,
+            quantized_bias_zp_name,
+            None,
+            onnx.TensorProto.INT32,
+        )
+        return quantized_bias_name, quantized_value
+
+    def quantize_weight_per_channel(self, weight_name, weight_qType, sym, channel_axis):
+        """Quantize weight per-channel."""
+        name = (
+            ("_").join([weight_name, str(weight_qType)])
+            if self.model.get_initializer_share_num(weight_name) > 1
+            else weight_name
+        )
+        if name in self.quantized_value_map:
+            return (name + "_quantized", name + "_zero_point", name + "_scale")
+
+        initializer = quant_utils.find_by_name(weight_name, self.model.initializer())
+        if initializer is None:
+            raise ValueError("{} is not an initializer".format(weight_name))
+
+        weights = (
+            self.tensor_proto_to_array(initializer, os.path.dirname(self.model.model_path))
+            if self.model.model_path is not None
+            else self.tensor_proto_to_array(initializer)
+        )
+        rmin, rmax, zero_point, scale, quantized_weights =
quant_utils.quantize_data_per_channel( + weights, + channel_axis, + weight_qType, + sym, + self.reduce_range, + ) + + weight = quant_utils.QuantizedInitializer( + name, + initializer, + rmin, + rmax, + zero_point, + scale, + weights, + quantized_weights.flatten().tolist(), + channel_axis, + weight_qType, + ) + + self._update_weight(weight) + quantized_value = quant_utils.QuantizedValue( + weight.name, + weight.name + "_quantized", + weight.name + "_scale", + weight.name + "_zero_point", + None, + weight_qType, + ) + self.quantized_value_map[weight.name] = quantized_value + + return (weight.name + "_quantized", weight.name + "_zero_point", weight.name + "_scale") + + def dequantize_tensor(self, node, value_name): + """Dequantize tensor.""" + if value_name in self.quantized_value_map: + quantized_value = self.quantized_value_map[value_name] + dqlinear_name = value_name + "_DequantizeLinear" + dqlinear_inputs = [value_name + "_quantized", quantized_value.scale_name, quantized_value.zp_name] + dequantize_node = onnx.helper.make_node("DequantizeLinear", dqlinear_inputs, [value_name], dqlinear_name) + if dequantize_node not in self.new_nodes: + self.new_nodes.append(dequantize_node) + else: # pragma: no cover + data_found, scale_name, zp_name, _, _ = self._get_quantization_params(value_name) + if self.static: + if data_found is False: + raise ValueError( + "Quantization parameters are not specified for param {}." + "In static mode quantization params for inputs and outputs " + "of nodes to be quantized are required.".format(value_name) + ) + dqlinear_name = value_name + "_DequantizeLinear" + dqlinear_inputs = [value_name + "_quantized", scale_name, zp_name] + dequantize_node = onnx.helper.make_node("DequantizeLinear", dqlinear_inputs, [value_name], dqlinear_name) + if dequantize_node not in self.new_nodes: + self.new_nodes.append(dequantize_node) + + def _update_weight(self, weight): + """Update weight. 
+ + Given a weight object, update the graph by doing the following: + - remove old initializer, update new initializers for + quantized weight, zero point, and scale + - remove old weight input, update with new inputs for + quantized weight, zero point, and scale + This function does NOT update the nodes in the graph, just initializers and inputs + """ + if weight.name in self.quantized_value_map: + return + packed_weight_name = weight.name + "_quantized" + scale_name = weight.name + "_scale" + zero_point_name = weight.name + "_zero_point" + + # Update packed weight, zero point, and scale initializers + packed_weight_np_data = np.asarray( + weight.quantized_data, dtype=onnx.helper.tensor_dtype_to_np_dtype(weight.qType) + ).reshape(weight.initializer.dims) + packed_weight_initializer = onnx.numpy_helper.from_array(packed_weight_np_data, packed_weight_name) + + if not self.add_qdq_pair_to_weight or self.quant_format != "qdq": + self.model.initializer().append(packed_weight_initializer) + if weight.axis is not None: + zero_scale_shape = [weight.initializer.dims[weight.axis]] + else: # scale and zero point must be scalar + zero_scale_shape = [] + zero_point_type = weight.qType + scale_initializer = onnx.helper.make_tensor( + scale_name, weight.initializer.data_type, zero_scale_shape, weight.scales + ) + zero_initializer = onnx.helper.make_tensor( + zero_point_name, zero_point_type, zero_scale_shape, weight.zero_points + ) + + self.model.initializer().extend([scale_initializer, zero_initializer]) + + @staticmethod + def tensor_proto_to_array(initializer, base_dir=""): + """Convert TensorProto to array.""" + if quant_utils.is_quantizable_type(initializer.data_type): + weights = onnx.numpy_helper.to_array(initializer, base_dir) + else: + raise ValueError( + "Only float type quantization is supported. \ + Weights {} is {}.".format( + initializer.name, + str(onnx.helper.tensor_dtype_to_np_dtype(initializer.data_type)), + ) + ) + return weights + + def _get_quantization_params(self, param_name): + """Create initializers and inputs in the graph for zero point and scale of output. + + Zero point and scale values are obtained from self.quantization_params if specified. + + Args: + param_name (string): Name of the quantization parameter. + """ + if self.quantization_params is None or param_name not in self.quantization_params: + return False, "", "", "", "" + + params = self.quantization_params[param_name] + if params is None or len(params) != 2: + raise ValueError( + "Quantization parameters should contain zero point and scale. 
" + "Specified values for output {}: {}".format(param_name, params) + ) + + zero_point_values = [params[0]] + zero_point_shape = [] + zero_point_name = param_name + "_zero_point" + zero_point_type = onnx.helper.np_dtype_to_tensor_dtype(params[0].dtype) + + scale_values = [params[1]] + scale_shape = [] + scale_name = param_name + "_scale" + scale_dtype = onnx.helper.np_dtype_to_tensor_dtype(params[1].dtype) + + # Add initializers + init_zp = onnx.helper.make_tensor(zero_point_name, zero_point_type, zero_point_shape, zero_point_values) + self.model.add_initializer(init_zp) + init_scale = onnx.helper.make_tensor(scale_name, scale_dtype, scale_shape, scale_values) + self.model.add_initializer(init_scale) + + return True, scale_name, zero_point_name, scale_shape, zero_point_shape + + def _get_quantized_weight(self, initializer, qType, sym): + """Get quantized weight.""" + name = ( + ("_").join([initializer.name, str(qType)]) + if self.model.get_initializer_share_num(initializer.name) > 1 + else initializer.name + ) + if name in self.quantized_value_map: + return self.quantized_value_map[name] + weights_data = ( + self.tensor_proto_to_array(initializer, os.path.dirname(self.model.model_path)) + if self.model.model_path is not None + else self.tensor_proto_to_array(initializer) + ) + rmin, rmax, zero_point, scale, quantized_weights_data = quant_utils.quantize_data( + weights_data.flatten().tolist(), + qType, + sym, + self.reduce_range, + ) + weight = quant_utils.QuantizedInitializer( + name, + initializer, + [rmin], + [rmax], + [zero_point], + [scale], + weights_data, + quantized_weights_data, + axis=None, + qType=qType, + ) + + return weight + + def is_valid_quantize_weight(self, weight_name): + """Check weight can be quantized.""" + weight = quant_utils.find_by_name(weight_name, self.model.initializer()) + if weight is not None: + return quant_utils.is_quantizable_type(weight.data_type) + else: + return weight_name in self.quantized_value_map + + def get_bias_add_nodes(self, node, weight_name, last_output, quantized_bias_name): + """Given a node, this function handles bias add by adding a "reshape" node on bias and an "add" node. 
+ + Args: + node (NodeProto): current node (Conv) + weight_name (string): weight name + last_output (_type_): output of previous node (input to bias add) + quantized_bias_name (string): bias name + """ + # Add tensors for the shape to be reshaped to + weight = quant_utils.find_by_name(weight_name, self.model.initializer()) + if weight is None: + raise ValueError("Expected {} to be an initializer".format(node.input[1])) + + # Add reshape for correct broadcast + reshape_input_data = quantized_bias_name + reshape_input_shape = quantized_bias_name + "_reshape_shape" + reshape_input = [reshape_input_data, reshape_input_shape] + reshape_shape = np.ones((len(weight.dims)), dtype=np.int64) + reshape_shape[1] = -1 + init_shape = onnx.helper.make_tensor( + reshape_input_shape, onnx.TensorProto.INT64, [len(weight.dims)], reshape_shape + ) + self.model.add_initializer(init_shape) + + reshape_op_output = node.output[0] + "_reshape" + reshape_node = onnx.helper.make_node( + "Reshape", reshape_input, [reshape_op_output], quantized_bias_name + "reshape" + ) + self.new_nodes.append(reshape_node) + + # Add an Add operation for bias + bias_add_input = [last_output] + bias_add_input.append(reshape_op_output) + add_node_output = node.output[0] + "_bias_add" + add_node = onnx.helper.make_node("Add", bias_add_input, [add_node_output], quantized_bias_name + "bias_add") + self.new_nodes.append(add_node) + return add_node_output + + def quantize_outputs(self, node, initializer_use_weight_qType=True, direct_int8=False): + """Quantize node outputs.""" + for idx, tensor_name in enumerate(node.output): + if ( + tensor_name in self.value_infos + and self.value_infos[tensor_name].type.HasField("tensor_type") + and not quant_utils.is_quantizable_type(self.value_infos[tensor_name].type.tensor_type.elem_type) + ): + return + data_found = False + refer_name = node.input[0] if direct_int8 else tensor_name + + if refer_name in self.quantized_value_map: + scale_name = self.quantized_value_map[refer_name].scale_name + zp_name = self.quantized_value_map[refer_name].zp_name + data_found = True + elif refer_name in self.quantization_params: + data_found, scale_name, zp_name, _, _ = self._get_quantization_params(refer_name) + + if data_found is False: + raise ValueError( + "Quantization parameters are not specified for param {}." 
+ "In static mode quantization params for inputs and outputs " + "of nodes to be quantized are required.".format(tensor_name) + ) + + node.output[idx] = tensor_name + "_QuantizeInput" + q_input = node.output[idx] + q_output = tensor_name + "_quantized" + dq_input = q_output + dq_output = tensor_name + quant_node_name = tensor_name + "_" + node.name + "_QuantizeLinear" + dequant_node_name = tensor_name + "_" + node.name + "_DequantizeLinear" + qlinear_node = onnx.helper.make_node( + "QuantizeLinear", + [q_input, scale_name, zp_name], + [q_output], + quant_node_name, + ) + dequant_node = onnx.helper.make_node( + "DequantizeLinear", + [dq_input, scale_name, zp_name], + [dq_output], + dequant_node_name, + ) + self.new_nodes.extend([qlinear_node, dequant_node]) + for child in self.model.get_children(node): + self.replace_input.append([child, tensor_name, dequant_node.output[0]]) + if tensor_name not in self.quantized_value_map: + quantized_value = quant_utils.QuantizedValue(tensor_name, dq_output, scale_name, zp_name) + self.quantized_value_map[tensor_name] = quantized_value + + def quantize_inputs(self, node, indices=None, initializer_use_weight_qType=True, direct_int8=False): + """Quantize node inputs.""" + # Quantize the input + for idx, tensor_name in enumerate(node.input): + if indices and idx not in indices: + continue + initializer = quant_utils.find_by_name(tensor_name, self.model.initializer()) + if initializer is not None: + if not quant_utils.is_quantizable_type(initializer.data_type): + return + + dtype = ( + self.config[node.name]["weight_type"] + if initializer_use_weight_qType + else self.config[node.name]["activation_type"] + ) + sym = ( + self.config[node.name]["weight_sym"] + if initializer_use_weight_qType + else self.config[node.name]["activation_sym"] + ) + weight = self._get_quantized_weight(initializer, dtype, sym) + self._update_weight(weight) + node.input[idx] = weight.name + q_weight_name = weight.name + "_quantized" + zp_name = weight.name + "_zero_point" + scale_name = weight.name + "_scale" + + if self.add_qdq_pair_to_weight and self.quant_format == "qdq": + qlinear_node = onnx.helper.make_node( + "QuantizeLinear", + [tensor_name, scale_name, zp_name], + [weight.name + "_quantized"], + weight.name + "_QuantizeLinear", + ) + self.new_nodes.append(qlinear_node) + + dequant_node = onnx.helper.make_node( + "DequantizeLinear", + [q_weight_name, scale_name, zp_name], + [weight.name + "_dequantized"], + weight.name + "_DequantizeLinear", + ) + self.new_nodes.append(dequant_node) + self.replace_input.append([node, weight.name, dequant_node.output[0]]) + if weight.name not in self.quantized_value_map: + quantized_value = quant_utils.QuantizedValue( + weight.name, + q_weight_name, + scale_name, + zp_name, + None, + dtype, + ) + self.quantized_value_map[weight.name] = quantized_value + else: + if ( + tensor_name in self.value_infos + and self.value_infos[tensor_name].type.HasField("tensor_type") + and not quant_utils.is_quantizable_type(self.value_infos[tensor_name].type.tensor_type.elem_type) + ): + return + self._quantize_activation(node, tensor_name, direct_int8) + + def quantize_weights_per_channel(self, node, indices, weight_qType, sym, axis): + """Quantize weights per-channel.""" + if self.opset_version < 13 and self.quant_format == "qdq": + self.quantize_inputs(node, indices) + return + + for idx, inp in enumerate(node.input): + if idx not in indices: + continue + + q_name, zp_name, scale_name = self.quantize_weight_per_channel(inp, weight_qType, sym, axis) + 
weight_name = ("_").join([inp, str(weight_qType)]) if self.model.get_initializer_share_num(inp) > 1 else inp + dequant_node = onnx.helper.make_node( + "DequantizeLinear", + [q_name, scale_name, zp_name], + [weight_name + "_dequantized"], + weight_name + "_DequantizeLinear", + axis=axis, + ) + self.new_nodes.append(dequant_node) + node.input[idx] = weight_name + + # Replace weight_name with output of DequantizeLinear + self.replace_input.append([node, weight_name, dequant_node.output[0]]) + + if self.add_qdq_pair_to_weight and self.quant_format == "qdq": + qlinear_node = onnx.helper.make_node( + "QuantizeLinear", + [inp, scale_name, zp_name], + [q_name], + weight_name + "_QuantizeLinear", + axis=axis, + ) + self.new_nodes.append(qlinear_node) + + +class StaticQuantizer(Quantizer): + """Static quantizer class.""" + + def __init__( + self, + model, + q_config, + quant_format="qoperator", + quantization_params={}, + op_types_to_quantize=[], + fallback_list=["fp32"], + reduce_range=None, + add_qdq_pair_to_weight=False, + optypes_to_exclude_output_quant=[], + dedicated_qdq_pair=False, + execution_provider="CPUExecutionProvider", + ): + """Initialization. + + Args: + model (ModelProto or ONNXModel): onnx model or onnx model wrapper by neural compressor + q_config (dict): op-wise quantization config. + static (bool): static or not + quantization_params (dict): scale and zero point of tensors + op_types_to_quantize (list): optypes to quantize + fallback_list (list, optional): fallback data type. Defaults to ['fp32']. + reduce_range (bool, optional): use 7 bit or not. Defaults to None. + add_qdq_pair_to_weight (bool, optional): add QDQ pair to weight or not. Defaults to False. + optypes_to_exclude_output_quant (list, optional): optypes to exclude output quantization. Defaults to []. + dedicated_qdq_pair (bool, optional): dedicate QDQ pair or not. Defaults to False. + execution_provider (str, optional): execution_provider of onnxrt adaptor. Defaults to CPUExecutionProvider + """ + super().__init__( + mode="static_quant", + model=model, + q_config=q_config, + static=True, + quantization_params=quantization_params, + op_types_to_quantize=op_types_to_quantize, + ) + self.fallback_list = fallback_list + self.reduce_range = reduce_range + self.add_qdq_pair_to_weight = add_qdq_pair_to_weight + self.optypes_to_exclude_output_quant = optypes_to_exclude_output_quant + self.dedicated_qdq_pair = dedicated_qdq_pair + self.execution_provider = execution_provider + self.static = True # use static quantization for inputs. 
+ self.quant_format = quant_format + if self.opset_version < 13 and self.quant_format == "qdq": + logger.warning( + "Per-channel support with QDQ format requires opset version >= 13," + " use per-tensor granularity instead" + ) + if "TensorrtExecutionProvider" in execution_provider: + + # TensorrtExecutionProvider doesn't support Conv + Add fusion + self._revert_conv_add_fusion() + + # only quantize Add which is followed by ReduceMean + for node in self.model.nodes(): + if node.op_type == "Add": + children = self.model.get_children(node) + if "ReduceMean" not in [i.op_type for i in children]: + self.config[node.name] = "fp32" + + def _revert_conv_add_fusion(self): + add_nodes = [] + remove_nodes = [] + for node in self.model.nodes(): + if node.op_type == "Conv" and len(node.input) == 3: + bias_tensor = self.model.get_initializer(node.input[2]) + bias_array = onnx.numpy_helper.to_array(bias_tensor).reshape((-1, 1, 1)) + self.model.remove_initializer(bias_tensor) + self.model.add_initializer(onnx.numpy_helper.from_array(bias_array, bias_tensor.name)) + kwargs = {} + activation_params = None + for attr in node.attribute: + kwargs.update(quant_utils.attribute_to_kwarg(attr)) + conv = onnx.helper.make_node("Conv", node.input[0:2], [node.name + "_revert"], node.name, **kwargs) + add = onnx.helper.make_node("Add", [conv.output[0], node.input[2]], node.output, node.name + "_add") + add_nodes.extend([conv, add]) + + self.model.remove_nodes(remove_nodes) + self.model.add_nodes(add_nodes) + self.model.update() + + def _quantize_activation(self, node, tensor_name, direct_int8=False): + """Quantize node activation.""" + if tensor_name in self.quantized_value_map: + scale_name = self.quantized_value_map[tensor_name].scale_name + zp_name = self.quantized_value_map[tensor_name].zp_name + data_found = True + else: + data_found, scale_name, zp_name, _, _ = self._get_quantization_params(tensor_name) + + if data_found is False: + raise ValueError( + "Quantization parameters are not specified for param {}." 
+ "In static mode quantization params for inputs and outputs " + "of nodes to be quantized are required.".format(tensor_name) + ) + + if direct_int8: + # direct int8 models will be quantized only if their inputs are quantized + if node.input[0] not in self.quantized_value_map: + return + + q_input = tensor_name + q_output = ( + tensor_name + "_" + node.name + "_QuantizeLinear" + if tensor_name not in self.model.input() + else tensor_name + "_quantized" + ) + dq_input = q_output + dq_output = ( + tensor_name + "_" + node.name + "_dequantized" + if tensor_name not in self.model.input() + else tensor_name + "_dequantized" + ) + self.replace_input.append([node, tensor_name, dq_output]) + + if tensor_name in self.model.input() and tensor_name in self.quantized_value_map: + return + + quant_node_name = tensor_name + "_" + node.name + "_QuantizeLinear" + dequant_node_name = tensor_name + "_" + node.name + "_DequantizeLinear" + qlinear_node = onnx.helper.make_node( + "QuantizeLinear", + [q_input, scale_name, zp_name], + [q_output], + quant_node_name, + ) + dequant_node = onnx.helper.make_node( + "DequantizeLinear", + [dq_input, scale_name, zp_name], + [dq_output], + dequant_node_name, + ) + self.new_nodes.extend([qlinear_node, dequant_node]) + + if tensor_name not in self.quantized_value_map: + quantized_value = quant_utils.QuantizedValue( + tensor_name, + dq_output, + scale_name, + zp_name, + ) + self.quantized_value_map[tensor_name] = quantized_value + + +class DynamicQuantizer(Quantizer): + """Dynamic quantizer class.""" + + def __init__( + self, + model, + q_config, + quantization_params={}, + op_types_to_quantize=[], + fallback_list=["fp32"], + reduce_range=None, + execution_provider="CPUExecutionProvider", + ): + """Initialization. + + Args: + model (ModelProto or onnx_model.ONNXModel): onnx model or onnx model wrapper by neural compressor + q_config (dict): op-wise quantization config. + quantization_params (dict): scale and zero point of tensors + op_types_to_quantize (list): optypes to quantize + fallback_list (list, optional): fallback data type. Defaults to ['fp32']. + reduce_range (bool, optional): use 7 bit or not. Defaults to None. + add_qdq_pair_to_weight (bool, optional): add QDQ pair to weight or not. Defaults to False. + dedicated_qdq_pair (bool, optional): dedicate QDQ pair or not. Defaults to False. + execution_provider (str, optional): execution_provider of onnxrt adaptor. 
Defaults to CPUExecutionProvider + """ + super().__init__( + mode="dynamic_quant", + model=model, + q_config=q_config, + static=False, + quantization_params=quantization_params, + op_types_to_quantize=op_types_to_quantize, + ) + + def _quantize_activation(self, node, tensor_name, direct_int8=False): + """Quantize node activation.""" + qlinear_node = None + if quant_utils.find_by_name(tensor_name + "_QuantizeLinear", self.model.nodes()) is not None: + qlinear_node = quant_utils.find_by_name(tensor_name + "_QuantizeLinear", self.model.nodes()) + elif quant_utils.find_by_name(tensor_name + "_QuantizeLinear", self.new_nodes) is not None: + qlinear_node = quant_utils.find_by_name(tensor_name + "_QuantizeLinear", self.new_nodes) + if qlinear_node is None: + if ( + self.fuse_dynamic_quant + and self.config[node.name]["activation_type"] == onnx.TensorProto.UINT8 + and not self.config[node.name]["activation_sym"] + ): + # DynamicQuantizeLinear supports uint8 input for CPU EP, supports uint8 and int8 for DML EP + scale_name = tensor_name + "_scale" + zp_name = tensor_name + "_zero_point" + if quant_utils.find_by_name(scale_name, self.model.initializer()): + self.model.remove_initializer(quant_utils.find_by_name(scale_name, self.model.initializer())) + if quant_utils.find_by_name(zp_name, self.model.initializer()): + self.model.remove_initializer(quant_utils.find_by_name(zp_name, self.model.initializer())) + qlinear_node = onnx.helper.make_node( + "DynamicQuantizeLinear", + [tensor_name], + [tensor_name + "_dynamic_quantized", scale_name, zp_name], + tensor_name + "_QuantizeLinear", + ) + else: + scale_name, zp_name, _, _ = self._get_dynamic_input_quantization_params( + tensor_name, self.config[node.name]["activation_type"] + ) + qlinear_node = onnx.helper.make_node( + "QuantizeLinear", + [tensor_name, scale_name, zp_name], + [tensor_name + "_quantized"], + tensor_name + "_QuantizeLinear", + ) + if qlinear_node not in self.new_nodes: + self.new_nodes.append(qlinear_node) + self.quantized_value_map[tensor_name] = quant_utils.QuantizedValue( + tensor_name, + qlinear_node.output[0], + scale_name, + zp_name, + self.config[node.name]["activation_type"], + ) + self.replace_input.append([node, tensor_name, qlinear_node.output[0]]) + + def _get_dynamic_input_quantization_params(self, input_name, qType): + """Create nodes for dynamic quantization of input. + + Args: + input_name (string): Name of the input. + qType (int): type to quantize to. + """ + if qType == onnx.TensorProto.INT8: + return self._get_dynamic_input_quantization_params_int8(input_name) + + return self._get_dynamic_input_quantization_params_uint8(input_name) + + def _get_dynamic_input_quantization_params_int8(self, input_name): # pragma: no cover + """Create nodes for dynamic quantization of input to int8. + + Args: + input_name (string): Name of the input. 
+ """ + qType = onnx.TensorProto.INT8 + + # Reduce min and Reduce max + input_scale_name = input_name + "_scale" + + reduce_min_name = input_name + "_ReduceMin" + reduce_min_node = onnx.helper.make_node( + "ReduceMin", + [input_name], + [reduce_min_name + ":0"], + reduce_min_name, + keepdims=0, + ) + self.new_nodes.append(reduce_min_node) + + reduce_max_name = input_name + "_ReduceMax" + reduce_max_node = onnx.helper.make_node( + "ReduceMax", + [input_name], + [reduce_max_name + ":0"], + reduce_max_name, + keepdims=0, + ) + self.new_nodes.append(reduce_max_node) + + # Compute scale + # Find abs(rmin) + reduce_min_abs_name = reduce_min_name + "_Abs" + reduce_min_abs_node = onnx.helper.make_node( + "Abs", + [reduce_min_node.output[0]], + [reduce_min_abs_name + ":0"], + reduce_min_abs_name, + ) + self.new_nodes.append(reduce_min_abs_node) + # Find abs(rmax) + reduce_max_abs_name = reduce_max_name + "_Abs" + reduce_max_abs_node = onnx.helper.make_node( + "Abs", + [reduce_max_node.output[0]], + [reduce_max_abs_name + ":0"], + reduce_max_abs_name, + ) + self.new_nodes.append(reduce_max_abs_node) + # Compute max of abs(rmin) and abs(rmax) + abs_max_name = input_name + "_Abs_Max" + abs_max_node = onnx.helper.make_node( + "Max", + [reduce_min_abs_node.output[0], reduce_max_abs_node.output[0]], + [abs_max_name + ":0"], + abs_max_name, + ) + self.new_nodes.append(abs_max_node) + # and divide by (quantize_range/2.0) which will be equal to max(...)*2.0/quantize_range + qmin, qmax = quant_utils.get_qmin_qmax_for_qType(qType, self.reduce_range) + initializer_div = onnx.helper.make_tensor( + self.fixed_qrange_int8_name, + onnx.TensorProto.FLOAT, + [], + [(qmax - qmin) / 2.0], + ) + self.model.add_initializer(initializer_div) + scale_div_name = input_name + "scale_Div" + scale_div_node = onnx.helper.make_node( + "Div", + [abs_max_node.output[0], self.fixed_qrange_int8_name], + [input_scale_name], + scale_div_name, + ) + self.new_nodes.append(scale_div_node) + + # Zero point + initializer_zp = onnx.helper.make_tensor(self.fixed_zero_zp_name, qType, [], [0]) + self.model.add_initializer(initializer_zp) + + return input_scale_name, self.fixed_zero_zp_name, [], [] + + def _get_dynamic_input_quantization_params_uint8(self, input_name): + """Create nodes for dynamic quantization of input to uint8. + + Args: + input_name (string): Name of the input. + """ + qType = onnx.TensorProto.UINT8 + # Reduce min and Reduce max + input_scale_name = input_name + "_scale" + input_zp_name = input_name + "_zero_point" + + reduce_min_name = input_name + "_ReduceMin" + reduce_min_node = onnx.helper.make_node( + "ReduceMin", + [input_name], + [reduce_min_name + ":0"], + reduce_min_name, + keepdims=0, + ) + self.new_nodes.append(reduce_min_node) + + reduce_max_name = input_name + "_ReduceMax" + reduce_max_node = onnx.helper.make_node( + "ReduceMax", + [input_name], + [reduce_max_name + ":0"], + reduce_max_name, + keepdims=0, + ) + self.new_nodes.append(reduce_max_node) + + # Add tensors for quantize range and zero value. 
+ qmin, qmax = quant_utils.get_qmin_qmax_for_qType(qType, self.reduce_range) + initializer_qrange = onnx.helper.make_tensor( + self.fixed_qrange_uint8_name, + onnx.TensorProto.FLOAT, + [], + [qmax - qmin], + ) + self.model.add_initializer(initializer_qrange) + initializer_qvalue = onnx.helper.make_tensor(self.fixed_zero_name, onnx.TensorProto.FLOAT, [], [0.0]) + self.model.add_initializer(initializer_qvalue) + + # Compute Scale + # Subtract rmax and rmin + scale_sub_name = input_name + "_scale_Sub" + scale_sub_node = onnx.helper.make_node( + "Sub", + [reduce_max_node.output[0], reduce_min_node.output[0]], + [scale_sub_name + ":0"], + scale_sub_name, + ) + self.new_nodes.append(scale_sub_node) + # and divide by quantize range + scale_div_name = input_name + "_scale_Div" + scale_div_node = onnx.helper.make_node( + "Div", + [scale_sub_node.output[0], self.fixed_qrange_uint8_name], + [input_scale_name], + scale_div_name, + ) + self.new_nodes.append(scale_div_node) + + # Compute zero point + # Subtract zero and rmin + zp_sub_name = input_name + "_zero_point_Sub" + zp_sub_node = onnx.helper.make_node( + "Sub", + [self.fixed_zero_name, reduce_min_node.output[0]], + [zp_sub_name + ":0"], + zp_sub_name, + ) + self.new_nodes.append(zp_sub_node) + # Divide by scale + zp_div_name = input_name + "_zero_point_Div" + zp_div_node = onnx.helper.make_node( + "Div", + [zp_sub_node.output[0], input_scale_name], + [zp_div_name + ":0"], + zp_div_name, + ) + self.new_nodes.append(zp_div_node) + # Compute floor + zp_floor_name = input_name + "_zero_point_Floor" + zp_floor_node = onnx.helper.make_node("Floor", zp_div_node.output, [zp_floor_name + ":0"], zp_floor_name) + self.new_nodes.append(zp_floor_node) + # Cast to integer + zp_cast_name = input_name + "_zero_point_Cast" + zp_cast_node = onnx.helper.make_node("Cast", zp_floor_node.output, [input_zp_name], zp_cast_name, to=qType) + self.new_nodes.append(zp_cast_node) + + return input_scale_name, input_zp_name, [], [] diff --git a/onnx_neural_compressor/algorithms/smoother/core.py b/onnx_neural_compressor/algorithms/smoother/core.py index d21641482..bcf830f1a 100644 --- a/onnx_neural_compressor/algorithms/smoother/core.py +++ b/onnx_neural_compressor/algorithms/smoother/core.py @@ -22,22 +22,12 @@ import onnxruntime as ort from onnx_neural_compressor import data_reader, logger, onnx_model, utility +from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor.algorithms.smoother import calibrator from typing import List, Union # isort: skip -_dtype_map = { - np.dtype("float32"): 1, - np.dtype("uint8"): 2, - np.dtype("int8"): 3, - np.dtype("int32"): 6, - np.dtype("int64"): 7, - np.dtype("float16"): 10, - np.dtype("double"): 11, -} - - def _get_quant_dequant_output(model, input_data, output_data, providers): """Get loss between fp32 output and QDQ output. 
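The `_get_dynamic_input_quantization_params_uint8` helper added above emits ReduceMin/ReduceMax, Sub, Div, Floor and Cast nodes so the activation scale and zero point are computed in-graph at inference time. A minimal NumPy sketch of the arithmetic those nodes perform, assuming the default uint8 range (no reduce_range); the function name is illustrative and not part of the patch:

    import numpy as np

    def dynamic_uint8_scale_zp(x: np.ndarray):
        # ReduceMin / ReduceMax over the whole tensor (keepdims=0 in the graph)
        rmin, rmax = float(x.min()), float(x.max())
        qmin, qmax = 0, 255                        # uint8 range from get_qmin_qmax_for_qType
        scale = (rmax - rmin) / (qmax - qmin)      # Sub, then Div by the fixed quantize range
        # Sub(0, rmin) -> Div by scale -> Floor -> Cast to uint8
        # (no zero-range guard here, mirroring the in-graph computation)
        zero_point = np.uint8(np.floor((0.0 - rmin) / scale))
        return np.float32(scale), zero_point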
@@ -47,7 +37,7 @@ def _get_quant_dequant_output(model, input_data, output_data, providers): output_data (numpy.ndarray): fp32 output providers (list): execution provider """ - input_data = _quant_dequant_data(input_data, 2, "asym") + input_data = quant_utils.qdq_data(input_data, 2, False) sess = ort.InferenceSession(model.SerializeToString(), providers=providers) preds = sess.run(None, {model.graph.input[0].name: input_data}) loss = np.sum(np.abs(output_data - preds) ** 2) @@ -65,28 +55,22 @@ def _make_sub_graph(node, inits, input_data, output_data, opset, ir_version): opset (object): opset of the model ir_version (object): ir_version of the model """ - input = onnx.helper.make_tensor_value_info(node.input[0], _dtype_map[input_data.dtype], input_data.shape) - output = onnx.helper.make_tensor_value_info(node.output[0], _dtype_map[output_data.dtype], output_data.shape) + input = onnx.helper.make_tensor_value_info( + node.input[0], + onnx.helper.np_dtype_to_tensor_dtype(input_data.dtype), + input_data.shape, + ) + output = onnx.helper.make_tensor_value_info( + node.output[0], + onnx.helper.np_dtype_to_tensor_dtype(output_data.dtype), + output_data.shape, + ) graph = onnx.helper.make_graph([node], "sub_graph", [input], [output], inits) model = onnx.helper.make_model(graph, opset_imports=opset) model.ir_version = ir_version return model -def _quant_dequant_data(data, qType=3, scheme="sym"): - """Quantize and then dequantize data. - - Args: - data (numpy.ndarray): target data - qType (int): data type - scheme (str): sym or asym quantization - """ - rmin, rmax, zero_point, scale, quantized_data = utility.quantize_data( - data.flatten().tolist(), utility.get_qrange_for_qType(qType, False), qType, scheme - ) - return ((quantized_data - zero_point) * scale).astype(data.dtype).reshape(data.shape) - - class Smoother: """Fake input channel quantization. 
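In the smoother hunks above, the local `_quant_dequant_data` helper is replaced by the shared `quant_utils.qdq_data`, which quantizes a tensor and immediately dequantizes it so the fake-quantization effect can be evaluated. A hedged usage sketch, assuming the new `algorithms.utility` module is importable as shown in the diff; the data and variable names are illustrative only:

    import numpy as np
    from onnx_neural_compressor.algorithms import utility as quant_utils

    data = np.random.randn(64, 64).astype(np.float32)

    # Asymmetric uint8 round-trip, as _get_quant_dequant_output applies to the sub-graph input.
    act_qdq = quant_utils.qdq_data(data, 2, False)      # 2 == onnx.TensorProto.UINT8

    # Symmetric int8 round-trip, as _get_output_loss applies to weight initializers.
    weight_qdq = quant_utils.qdq_data(data, 3, True)    # 3 == onnx.TensorProto.INT8

    # Illustrative reconstruction error; the patch itself measures loss on the sub-graph outputs.
    err = np.mean((data - weight_qdq) ** 2)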
@@ -102,7 +86,7 @@ def __init__( self, model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], dataloader: data_reader.CalibrationDataReader, - providers: List[str] = ["CPUExecutionProvider"], + execution_provider: str = "CPUExecutionProvider", ): """Initialize the attributes of class.""" self.model = ( @@ -112,7 +96,7 @@ def __init__( self.value_infos.update({ot.name: ot for ot in self.model.model.graph.output}) self.value_infos.update({it.name: it for it in self.model.model.graph.input}) self.dataloader = dataloader - self.providers = providers + self.providers = [execution_provider] self.tensor_scales_info = {} self.new_added_mul_nodes = [] self.new_added_value_info = [] @@ -204,7 +188,7 @@ def _dump_op_info(self, percentile, op_types, iterations): self.model, self.dataloader, iterations=list(range(0, iterations)), - backend=self.providers, + execution_provider=self.providers, ) self.max_vals_per_channel, self.shape_info, self.tensors_to_node = sq_calibrator.calib_smooth( @@ -382,7 +366,7 @@ def _get_output_loss(self, node_name, scale, calib_iter): ) base_dir = "" if not self.model.is_large_model else os.path.dirname(self.model.model_path) weight = onnx.numpy_helper.to_array(self.model.get_initializer(node.input[1]), base_dir) - weight_q = _quant_dequant_data(weight) + weight_q = quant_utils.qdq_data(weight, 3, True) self.model.set_initializer(node.input[1], weight_q) inits = [self.model.get_initializer(i) for i in node.input if self.model.get_initializer(i) is not None] @@ -468,7 +452,7 @@ def _auto_tune_alpha( self._adjust_weights(scale) input_scale = ( self._reshape_scale_for_input(tensor_name, key) - if not (node.op_type == "Gemm" and utility.is_B_transposed(node)) + if not (node.op_type == "Gemm" and quant_utils.is_B_transposed(node)) else self.tensor_scales_info[key] ) loss = self._get_output_loss(node_info[0], input_scale, calib_iter) @@ -505,7 +489,6 @@ def _get_smooth_scales(self, alpha, target_list=[]): Returns: the smooth scales for weights, currently one input tensor only have one scale """ - logger.info("Start smooth scales collection.") scales = {} for tensor, nodes in self.tensors_to_node.items(): # if scales_per_op the key of scales is the node name, otherwise the activation of node @@ -519,7 +502,7 @@ def _get_smooth_scales(self, alpha, target_list=[]): base_dir=os.path.dirname(self.model.model_path) if self.model.model_path is not None else "", ) if (len(weight.shape) == 4 and weight.shape[1] != 1) or ( - node.op_type == "Gemm" and utility.is_B_transposed(node) + node.op_type == "Gemm" and quant_utils.is_B_transposed(node) ): weight = np.moveaxis(weight, 0, 1) specific_alpha = alpha[node_info[0]] if isinstance(alpha, dict) else alpha @@ -535,7 +518,7 @@ def _get_smooth_scales(self, alpha, target_list=[]): base_dir=os.path.dirname(self.model.model_path) if self.model.model_path is not None else "", ) if (len(weight.shape) == 4 and weight.shape[1] != 1) or ( - node.op_type == "Gemm" and utility.is_B_transposed(node) + node.op_type == "Gemm" and quant_utils.is_B_transposed(node) ): weight = np.moveaxis(weight, 0, 1) weight = weight.reshape(weight.shape[0], -1) @@ -588,7 +571,7 @@ def _insert_smooth_mul_op(self, scales): name = key + "_" + "smooth_scale" scale_tensor = onnx.helper.make_tensor( name=key + "_" + "smooth_scale", - data_type=onnx.onnx_pb.TensorProto.FLOAT, + data_type=onnx.TensorProto.FLOAT, dims=scale_factor.shape, vals=scale_factor.flatten().tolist(), ) @@ -632,7 +615,7 @@ def _adjust_weights(self, scales): if len(weight.shape) == 2: scale = 
( np.expand_dims(scales[key], axis=0) - if node.op_type == "Gemm" and utility.is_B_transposed(node) + if node.op_type == "Gemm" and quant_utils.is_B_transposed(node) else np.expand_dims(scales[key], axis=-1) ) new_weight = weight * scale diff --git a/onnx_neural_compressor/algorithms/utility.py b/onnx_neural_compressor/algorithms/utility.py new file mode 100644 index 000000000..d802dc04d --- /dev/null +++ b/onnx_neural_compressor/algorithms/utility.py @@ -0,0 +1,702 @@ +# Copyright (c) 2023 MIT HAN Lab +# This source code is licensed under the MIT license +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import struct +import sys +from importlib import util + +import numpy as np +from packaging import version + +from onnx_neural_compressor import constants, utility + +if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): # pragma: no cover + import onnxruntime_extensions + +onnx = utility.LazyImport("onnx") +ort = utility.LazyImport("onnxruntime") + +__producer__ = "onnx.quantize" +__version__ = "0.1.0" +onnx_domain = "ai.onnx" +ms_domain = "com.microsoft" +QUANT_OP_NAME_SUFFIX = "_quant" + + +def attribute_to_kwarg(attribute): + """Convert attribute to kwarg format for use with onnx.helper.make_node.""" + attribute_mapping = { + 1: attribute.f, + 2: attribute.i, + 3: attribute.s, + 4: attribute.t, + 5: attribute.g, + 6: attribute.floats, + 7: attribute.ints, + 8: attribute.strings, + 9: attribute.tensors, + 10: attribute.graphs, + } + if attribute.type in attribute_mapping: + value = attribute_mapping[attribute.type] + else: # pragma: no cover + raise ValueError( + "attribute {} has no type specified " "or unsupported type {}.".format(attribute.name, attribute.type) + ) + return {attribute.name: value} + + +ONNX_INT_TYPE_RANGE = { + onnx.TensorProto.UINT8: (0, 255), + onnx.TensorProto.INT8: (-128, 127), +} + +ONNX_INT_TYPE_SYMMETRIC_RANGE = { + onnx.TensorProto.INT8: (-127, 127), +} + +ONNX_INT_TYPE_REDUCED_RANGE = { + onnx.TensorProto.UINT8: (0, 127), + onnx.TensorProto.INT8: (-64, 64), +} + +ONNX_STR_TYPE_RANGE = { + "int1": (-1, 0), + "int2": (-2, 1), + "int3": (-4, 3), + "int4": (-8, 7), # onnx >= 1.16.0 defines TensorProto.INT4 + "int5": (-16, 15), + "int6": (-32, 31), + "int7": (-64, 63), + "int8": (-128, 127), + "uint1": (0, 1), + "uint2": (0, 3), + "uint3": (0, 7), + "uint4": (0, 15), # onnx >= 1.16.0 defines TensorProto.UINT4 + "uint5": (0, 31), + "uint6": (0, 63), + "uint7": (0, 127), + "uint8": (0, 255), +} + + +def _qType_to_np_type(qType): + if isinstance(qType, int): + return onnx.helper.tensor_dtype_to_np_dtype(qType) + elif isinstance(qType, str) and "uint" in qType: + return np.dtype("uint8") + else: + return np.dtype("int8") + + +def find_by_name(name, item_list): + """Helper function to find item by name in a list.""" + items = [] + for item in item_list: + assert hasattr(item, "name"), "{} should have a 'name' attribute defined".format(item) + if item.name == name: + items.append(item) + if 
len(items) > 0: + return items[0] + else: + return None + + +def get_qmin_qmax_for_qType(qType, reduce_range=False, sym=False): # noqa: N802 + """Get qmin, qmax for qType. + + Args: + qType (int or str): int for onnx defined type, str for onnx not defined type + reduce_range (bool, optional): whether use 7 bit for 8bit quantization + sym (bool, optional): quantization scheme. Defaults to False. + """ + if qType == onnx.TensorProto.FLOAT8E4M3FN: + raise NotImplementedError("This function is not implemented for float 8 as not needed.") + + qrange = None + + if isinstance(qType, str): + qrange = ONNX_STR_TYPE_RANGE.get(qType) + elif reduce_range: + qrange = ONNX_INT_TYPE_REDUCED_RANGE.get(qType) + elif sym and qType in ONNX_INT_TYPE_SYMMETRIC_RANGE: + qrange = ONNX_INT_TYPE_SYMMETRIC_RANGE[qType] + else: + qrange = ONNX_INT_TYPE_RANGE.get(qType) + + if not qrange: + raise ValueError(f"Unexpected data type {qType} requested.") + + return qrange + + +def quantize_nparray(dtype, arr, scale, zero_point, low=None, high=None): + """Quantize numpy array.""" + q_weight = np.empty_like(np.asarray(arr), dtype=np.asarray(scale).dtype) + np.divide(arr, scale, out=q_weight) + np.add(q_weight, zero_point, out=q_weight) + np.round(q_weight, out=q_weight) + if low is not None and high is not None: + np.clip(q_weight, low, high, out=q_weight) + return q_weight.astype(dtype) + + +def quantize_data_per_channel(data, axis, qType, sym, reduce_range=False): + """Quantize tensor per-channel.""" + quantize_range = get_qmin_qmax_for_qType(qType, reduce_range, sym) + rmin = None + rmax = None + for i in range(len(data.shape)): + if i != axis: + rmin = np.min(data, axis=i, keepdims=True) if rmin is None else np.min(rmin, axis=i, keepdims=True) + rmax = np.max(data, axis=i, keepdims=True) if rmax is None else np.max(rmax, axis=i, keepdims=True) + rmin = np.minimum(rmin, 0) + rmax = np.maximum(rmax, 0) + scale, zero_point = calculate_scale_zp(rmin, rmax, qType, sym, reduce_range) + + dtype = _qType_to_np_type(qType) + quantized_data = quantize_nparray(dtype, data, scale, zero_point, low=quantize_range[0], high=quantize_range[1]) + return rmin.reshape(-1, 1), rmax.reshape(-1, 1), zero_point.reshape(-1, 1), scale.reshape(-1, 1), quantized_data + + +def dequantize_data_with_scale_zero(tensor_value, scale_value, zo_value): # pragma: no cover + """Dequantize tensor with scale and zero point.""" + return (tensor_value.astype(scale_value.dtype) - zo_value.astype(scale_value.dtype)) * scale_value + + +def dequantize_data(tensor_value, scale_value, zo_value, axis=0): # pragma: no cover + """Dequantize tensor.""" + if not isinstance(scale_value, np.ndarray): + return dequantize_data_with_scale_zero(tensor_value, scale_value, zo_value) + else: + channel_count = tensor_value.shape[axis] # TBD, default from axis 0 + new_per_channel_tensor_values = [] + for i in range(channel_count): + per_channel_tensor_value = tensor_value.take(i, axis) + per_channel_scale_value = scale_value.take(i) + per_channel_zero_value = zo_value.take(i) + new_per_channel_tensor_values.append( + dequantize_data_with_scale_zero( + per_channel_tensor_value, per_channel_scale_value, per_channel_zero_value + ) + ) + # combine per_channel_data into one + reshape_dims = list(tensor_value.shape) # deep copy + reshape_dims[axis] = 1 # only one per channel for reshape + new_tensor_value = new_per_channel_tensor_values[0].reshape(reshape_dims) + for i in range(1, channel_count): + new_per_channel_tensor_value = new_per_channel_tensor_values[i].reshape(reshape_dims) + 
new_tensor_value = np.concatenate((new_tensor_value, new_per_channel_tensor_value), axis) + return new_tensor_value + + +def calculate_scale_zp(rmin, rmax, qType, sym, reduce_range=False): + """Calculate scale and zero point.""" + qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, sym) + dtype = _qType_to_np_type(qType) + if isinstance(rmax, np.ndarray): + if sym: + max_range = np.maximum(abs(rmin), abs(rmax)) + rmin = -max_range + rmax = max_range + scale = (rmax - rmin) / (qmax - qmin) + scale[scale < np.finfo(rmax.dtype).tiny] = 1 + zero_point = ( + np.multiply(np.ones(rmax.shape), np.round((qmax + qmin) / 2.0)).astype(dtype) + if sym + else np.round(qmin - rmin / scale).astype(dtype) + ) + else: + if sym: + max_range = max(abs(rmin), abs(rmax)) + scale = (float(max_range) * 2) / (qmax - qmin) if max_range > 0 else 1 + else: + scale = (float(rmax) - float(rmin)) / (qmax - qmin) if rmin != rmax else 1 + zero_point = np.round((qmax + qmin) / 2.0).astype(dtype) if sym else np.round(qmin - rmin / scale).astype(dtype) + return np.float32(scale), zero_point + + +def quantize_data(data, qType, sym, reduce_range=False, ratio=1.0, axis=None): + """Quantize data. + + To pack weights, we compute a linear transformation + - when data type == uint8 mode, from [rmin, rmax] -> [0, 2^{b-1}] and + - when data type == int8, from [-m , m] -> [-(2^{b-1}-1), 2^{b-1}-1] where + m = max(abs(rmin), abs(rmax)) + and add necessary intermediate nodes to transform quantized weight to full weight + using the equation r = S(q-z), where + r: real original value + q: quantized value + S: scale + z: zero point + + Args: + data (array): data to quantize + qType (int): data type to quantize to. Supported types UINT8 and INT8 + sym (bool): whether use sym quantization. + reduce_range (bool): whether use 7 bit or not. Defaults to False + ratio (float, optional): percentile of clip. Defaults to 1.0 + axis (int, optional): process data along a specific axis. Default is None (process the whole data) + """ + quantize_range = get_qmin_qmax_for_qType(qType, reduce_range, sym) + rmin = np.min(np.min(data), 0) if axis is None else np.min(data, axis=1, keepdims=True) + rmax = np.max(np.max(data), 0) if axis is None else np.max(data, axis=1, keepdims=True) + rmin *= ratio + rmax *= ratio + + scale, zero_point = calculate_scale_zp(rmin, rmax, qType, sym, reduce_range) + dtype = _qType_to_np_type(qType) + quantized_data = quantize_nparray(dtype, data, scale, zero_point, low=quantize_range[0], high=quantize_range[1]) + return rmin, rmax, zero_point, scale, quantized_data + + +def qdq_data(data, qType, sym, reduce_range=False, ratio=1.0, axis=None): + _, _, zero_point, scale, quantized_data = quantize_data(data, qType, sym, reduce_range, ratio, axis) + return scale * (quantized_data - zero_point) + + +def is_B_transposed(node): + """Whether inuput B is transposed.""" + transB = [attr for attr in node.attribute if attr.name == "transB"] + if len(transB): + return 0 < onnx.helper.get_attribute_value(transB[0]) + return False + + +def is_quantizable_type(data_type): + return data_type in [onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16, onnx.TensorProto.BFLOAT16] + + +def _get_blob_size(group_size, has_zp): # pragma: no cover + """Get blob_size. 
+ + Args: + group_size (int): how many elements share one scale/zp + has_zp (bool): whether zero_point is None + """ + if version.Version(ort.__version__) > constants.ONNXRT1161_VERSION: + blob_size = group_size // 2 + elif has_zp: + blob_size = group_size // 2 + 4 + 1 + else: + blob_size = group_size // 2 + 4 + return blob_size + + +def make_matmul_weight_only_node( + node: onnx.NodeProto, + weight_shape: tuple, + num_bits: int, + group_size: int, + k_blocks: int, + q_weight: np.array, + scale: np.array, + zero_point: np.array, + accuracy_level: int = 0, +): + """Build MatMulFpQ4/MatMulNBits node. + + Args: + node (onnx.NodeProto): original matmul node + weight_shape (tuple): original weight shape + num_bits (int): number of bits used to represent weights. + group_size (int): how many elements share one scale/zp + k_blocks (int): block number + q_weight (np.array): quantized weight + scale (np.array): scale + zero_point (np.array): zero point + accuracy_level (int, optional): accuracy level. + Support 0 (unset), 1(fp32 compute type of jblas kernel), + 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), + 4 (int8 compute type of jblas kernel) Defaults to 0. + + Returns: + matmul_weight_only_node: MatMulFpQ4 or MatMulNBits node + new_inits: initializers of the new node + """ + blob_size = _get_blob_size(group_size, zero_point is not None) + packed = np.zeros((q_weight.shape[0], blob_size), dtype="uint8") + q_weight_name = node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)) + input_names = [node.input[0], q_weight_name] + new_inits = [] + kwargs = {} + + if version.Version(ort.__version__) > constants.ONNXRT1161_VERSION: + op_type = "MatMulNBits" + + # pack quantized weight + q_weight_pairs = q_weight[:, ::2] | q_weight[:, 1::2] << 4 + packed[:, :] = q_weight_pairs[:, :blob_size] + packed = np.reshape(packed, (-1, k_blocks, blob_size)) + + # build scale tensor + scale = np.reshape(scale, (-1, k_blocks)) + scale_tensor = onnx.helper.make_tensor( + name=node.input[1] + "_scale", + data_type=onnx.helper.np_dtype_to_tensor_dtype(scale.dtype), + dims=scale.shape, + vals=scale.tobytes(), + raw=True, + ) + input_names.append(scale_tensor.name) + new_inits.append(scale_tensor) + + # build zero_point tensor + if zero_point is not None: + if num_bits > 4: + packed_zp = np.reshape(zero_point, (1, -1)).astype("uint8") + else: + packed_zp = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8") + # create an index array + idx = np.arange(zero_point.shape[0] // k_blocks * k_blocks).reshape(-1) + # separate odd and even indices + even_idx = idx[::2] + odd_idx = idx[1::2] + # vectorized operation for even and odd indices + packed_zp[even_idx // 2] = (packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel() + packed_zp[odd_idx // 2] = (packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4) + + zp_tensor = onnx.helper.make_tensor( + name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True + ) + input_names.append(zp_tensor.name) + new_inits.append(zp_tensor) + + # set kwargs + kwargs["K"] = weight_shape[0] + kwargs["N"] = weight_shape[1] + kwargs["bits"] = num_bits + kwargs["block_size"] = group_size + if accuracy_level > 0: + # require onnxruntime > 1.16.3 + kwargs["accuracy_level"] = accuracy_level + + else: + offset = 5 if zero_point is not None else 4 + op_type = "MatMulFpQ4" + + # pack quantized weight + for i in range(q_weight.shape[0]): + bf = struct.pack("f", scale[i]) + packed[i][0] = bf[0] + 
packed[i][1] = bf[1] + packed[i][2] = bf[2] + packed[i][3] = bf[3] + + if zero_point is not None: + packed[i][4] = zero_point[i] + + packed[i][offset:] = np.bitwise_or( + q_weight[i][: group_size // 2], np.left_shift(q_weight[i][group_size // 2 :], num_bits) + ) + packed = packed.reshape(-1) + + # build shape tensor + shape_tensor = onnx.helper.make_tensor( + name=node.input[1] + "_shape", data_type=7, dims=(2,), vals=np.array(weight_shape, dtype="int64") + ) + new_inits.append(shape_tensor) + input_names.append(shape_tensor.name) + + # set kwargs + kwargs["blk_quant_type"] = 1 if zero_point is not None else 0 + + q_weight_tensor = onnx.helper.make_tensor( + name=q_weight_name, + data_type=2, + dims=packed.shape, + vals=packed.tobytes(), + raw=True, + ) + new_inits.append(q_weight_tensor) + + matmul_weight_only_node = onnx.helper.make_node( + op_type, + inputs=input_names, + outputs=node.output, + name=node.name + "_Q" + str(num_bits) if node.name else "_Q" + str(num_bits), + domain="com.microsoft", + **kwargs, + ) + return matmul_weight_only_node, new_inits + + +def prepare_inputs(model, data_reader, providers): + """Prepare inputs for weight only quantization. + + Args: + model (ModelProto or onnx_model.ONNXModel): onnx model. + data_reader (CalibrationDataReader): a calibration data reader. + providers (list): providers to use. + + Returns: + inputs: prepared inputs. + so: session options + """ + + so = ort.SessionOptions() + if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): # pragma: no cover + so.register_custom_ops_library(onnxruntime_extensions.get_library_path()) + if model.is_large_model: + onnx.save_model( + model.model, + model.model_path + "_augment.onnx", + save_as_external_data=True, + all_tensors_to_one_file=True, + convert_attribute=False, + ) + + inputs_list = [] + while True: + inputs = data_reader.get_next() + if not inputs: + break + inputs_list.append(inputs) + return inputs_list, so + + +def pad_tensor(weight, group_size, k_blocks): + """Pad tensor rowi so that it can be is divisible by group_size. + + Args: + weight (array): weight + group_size (int): how many elements share one scale/zp + k_blocks (int): the number of block + + Returns: + weight: paded weight + """ + if group_size == -1: + return weight + + org_w_shape = weight.shape + padded_rows = k_blocks * group_size + pad_len = padded_rows - org_w_shape[0] + + if pad_len > 0: + weight = np.pad(weight, ((0, pad_len), (0, 0)), "constant") + + return weight + + +def dump_woq_stats(model, quantize_config): + res = {} + + dtype_set = set() + for node in model.graph.node: + if node.name.split("_Q")[0] not in quantize_config: + continue + if node.op_type in ["MatMulFpQ4", "MatMulNBits"]: + optype = "MatMul" + else: + optype = node.op_type + + if optype not in res: + res[optype] = {} + if re.fullmatch("^.*_Q\d*G\d*", node.input[1]): + search_out = re.search("_Q\d*", node.input[1]) + dtype = "A32W{}G{}".format( + node.input[1][search_out.start() + 2 : search_out.end()], node.input[1][search_out.end() + 1 :] + ) + else: + dtype = "FP32" + dtype_set.add(dtype) + + if dtype in res[optype]: + res[optype][dtype] += 1 + else: + res[optype][dtype] = 1 + + dtype_list = list(dtype_set) + for dtype in dtype_list: + for optype in res.keys(): + if dtype not in res[optype]: + res[optype][dtype] = 0 + + # update stats format for dump. 
+ field_names = ["Op Type", "Total"] + field_names.extend(dtype_list) + output_data = [] + for op_type in res.keys(): + field_results = [op_type, sum(res[op_type].values())] + field_results.extend([res[op_type][dtype] for dtype in dtype_list]) + output_data.append(field_results) + + utility.Statistics(output_data, header="Mixed Precision Statistics", field_names=field_names).print_stat() + + +def get_node_original_name(node) -> str: + """Get the original name of the given node.""" + node_name: str = node.name + # TODO how to handle the unquantized node that has the `_quant` suffix, such as `conv_quant`? + if node_name.endswith(QUANT_OP_NAME_SUFFIX): + return node_name[: -len(QUANT_OP_NAME_SUFFIX)] + else: + # For unquantized nodes + return node_name + + +def split_shared_bias(model): + """Split shared tensor.""" + input_name_to_nodes = model.input_name_to_nodes() + for input_name, node_list in input_name_to_nodes.items(): + if len(node_list) > 1 and input_name in [i.name for i in model.model.graph.initializer]: + for node in node_list[1:]: + if node.op_type not in ["Conv", "FusedConv"]: + continue + if len(node.input) > 2 and node.input[2] == input_name: + new_input_name = node.input[2] + "_nc_split_" + node.name + new_input = onnx.helper.make_tensor( + new_input_name, + model.get_initializer(input_name).data_type, + model.get_initializer(input_name).dims, + model.get_initializer(input_name).raw_data, + True, + ) + model.add_initializer(new_input) + node.input[2] = new_input_name + return model + + +def remove_init_from_model_input(model): + """Remove initializer from model input.""" + inputs = model.model.graph.input + name_to_input = {} + for inp in inputs: + name_to_input[inp.name] = inp + for initializer in model.model.graph.initializer: + if initializer.name in name_to_input: + inputs.remove(name_to_input[initializer.name]) + + +class QuantizedValue: + """Represents a linearly quantized value (input/output/initializer).""" + + def __init__( + self, + name, + new_quantized_name, + scale_name, + zero_point_name, + axis=None, + qType=1, + ): + """Initialization. + + Args: + name (string): tensor name + new_quantized_name (string): quantized tensor name + scale_name (string): scale name + zero_point_name (string): zero point name + axis (int, optional): quantized axis. Defaults to None. + qType (int, optional): quantized data type. Defaults to 1 (uint8). + """ + self.name = name + self.q_name = new_quantized_name + self.scale_name = scale_name + self.zp_name = zero_point_name + self.axis = axis + self.qType = qType + + +class QuantizedInitializer: + """Represents a linearly quantized weight input from ONNX operators.""" + + def __init__( + self, + name, + initializer, + rmins, + rmaxs, + zero_points, + scales, + data=[], + quantized_data=[], + axis=None, + qType=1, + ): + """Initialization. + + Args: + name (string): initializer name + initializer (onnx.onnx_ml_pb2.TensorProto): initializer + rmins (list): list of min value + rmaxs (list): list of max value + zero_points (list): list of zero point + scales (list): list of scale + data (list, optional): array version of the initializer. Defaults to []. + quantized_data (list, optional): quantized data. Defaults to []. + axis (int, optional): quantized axis. Defaults to None. + qType (int, optional): quantized data type. Defaults to 1 (uint8). 
+ """ + self.name = name + self.initializer = initializer # TensorProto initializer in ONNX graph + self.rmins = rmins # List of minimum range for each axis + self.rmaxs = rmaxs # List of maximum range for each axis + # 1D tensor of zero points computed for each axis. scalar if axis is empty + self.zero_points = zero_points + self.scales = scales # 1D tensor of scales computed for each axis. scalar if axis is empty + self.data = data # original data from initializer TensorProto + self.quantized_data = quantized_data # weight-packed data from data + # Scalar to specify which dimension in the initializer to weight pack. + self.axis = axis + # If empty, single zero point and scales computed from a single rmin and rmax + self.qType = qType + + +def dump_model_op_stats(model, quantize_config, fp32_op_list): + qdq_ops = ["QuantizeLinear", "DequantizeLinear", "DynamicQuantizeLinear"] + res = {} + for op_type in fp32_op_list: + res[op_type] = {"INT8": 0, "FP32": 0} + for op_type in qdq_ops: + res[op_type] = {"INT8": 0, "FP32": 0} + + for node in model.graph.node: + if node.name.endswith("_quant"): + if node.op_type.startswith("QLinear"): + origin_op_type = node.op_type.split("QLinear")[-1] + else: + origin_op_type = node.op_type.split("Integer")[0] + + if origin_op_type in ["QAttention", "QGemm"]: + origin_op_type = origin_op_type[1:] + elif origin_op_type == "DynamicQuantizeLSTM": + origin_op_type = "LSTM" + elif origin_op_type == "QEmbedLayerNormalization": + origin_op_type = "EmbedLayerNormalization" + res[origin_op_type]["INT8"] += 1 + + elif node.op_type in qdq_ops: + res[node.op_type]["INT8"] += 1 + + elif node.op_type in res: + res[node.op_type]["FP32"] += 1 + + field_names = ["Op Type", "Total", "INT8", "FP32"] + output_data = [ + [ + op_type, + sum(res[op_type].values()), + res[op_type]["INT8"], + res[op_type]["FP32"], + ] + for op_type in res.keys() + ] + + utility.Statistics(output_data, header="Quantization Statistics", field_names=field_names).print_stat() diff --git a/onnx_neural_compressor/algorithms/weight_only/awq.py b/onnx_neural_compressor/algorithms/weight_only/awq.py index 30d9e8442..9e07b45a6 100644 --- a/onnx_neural_compressor/algorithms/weight_only/awq.py +++ b/onnx_neural_compressor/algorithms/weight_only/awq.py @@ -24,9 +24,9 @@ import onnxruntime as ort from packaging import version -from onnx_neural_compressor import config, constants, data_reader, logger, onnx_model, utility +from onnx_neural_compressor import constants, data_reader, logger, onnx_model +from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor.algorithms.weight_only import rtn -from onnx_neural_compressor.algorithms.weight_only import utility as woq_utility from typing import List, Union # isort: skip @@ -39,7 +39,7 @@ def _get_weight_scale(weight, group_size): return scale -def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, group_size, scheme): +def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts): """Apply scale for salient weight.""" best_scales = {} new_init_tensors = [] @@ -48,6 +48,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, updated_nodes = [] base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" + input_name_to_nodes = model.input_name_to_nodes() for parent, nodes in absorb_pairs.items(): if any([node.input[0] not in output_dicts for node in nodes]): logger.warning( @@ -61,14 +62,17 @@ def _apply_awq_scale(model, weight_config, 
absorb_pairs, output_dicts, num_bits, dtype = None weight = [] org_out = [] + + num_bits = weight_config[nodes[0].name].get("weight_bits", 4) + group_size = weight_config[nodes[0].name].get("weight_group_size", 32) + sym = weight_config[nodes[0].name].get("weight_sym", True) + accuracy_level = weight_config[nodes[0].name].get("accuracy_level", 0) + + # use same params for all children of one parent for node in nodes: - if (node.name, node.op_type) in weight_config and weight_config.get( - (node.name, node.op_type), "fp32" - ) != "fp32": - num_bits = weight_config[(node.name, node.op_type)].get("weight_bits", 4) - group_size = weight_config[(node.name, node.op_type)].get("weight_group_size", 32) - scheme = "sym" if weight_config[(node.name, node.op_type)].get("weight_sym", True) else "asym" - break + weight_config.setdefault(node.name, {}).update({"weight_bits": num_bits}) + weight_config.setdefault(node.name, {}).update({"weight_group_size": group_size}) + weight_config.setdefault(node.name, {}).update({"weight_sym": sym}) # search scale best_error = float("inf") @@ -80,9 +84,6 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, ratio = ratio * 1 / n_grid loss = 0 for node in nodes: - if weight_config.get((node.name, node.op_type), {}) == "fp32": - continue - weight = onnx.numpy_helper.to_array(model.get_initializer(node.input[1]), base_dir) if len(weight.shape) != 2: continue @@ -95,7 +96,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, scales = np.clip(np.power(inp_scale, ratio) / np.power(w_scale, (1 - ratio)), 1e-4, None) scales = scales / np.sqrt(np.max(scales) * np.min(scales)) weight = weight.T * scales - weight = woq_utility.pad_tensor(weight, group_size, (org_w_shape[0] + group_size - 1) // group_size).T + weight = quant_utils.pad_tensor(weight.T, group_size, (org_w_shape[0] + group_size - 1) // group_size) if (version.Version(ort.__version__) > constants.ONNXRT1161_VERSION and num_bits == 4) or ( version.Version(ort.__version__) >= constants.ONNXRT116_VERSION @@ -104,16 +105,20 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, ): # pragma: no cover # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1 - q_weight = woq_utility.qdq_tensor(weight, num_bits, group_size, scheme, "uint") / np.expand_dims( - scales, axis=-1 - ) + q_weight = quant_utils.qdq_data( + weight.reshape((-1, group_size)), + "uint" + str(num_bits), + sym, + ).reshape(weight.shape) else: - q_weight = woq_utility.qdq_tensor(weight, num_bits, group_size, scheme, "int") / np.expand_dims( - scales, axis=-1 - ) - - q_weight = np.reshape(q_weight, (org_w_shape[1], -1))[:, : org_w_shape[0]] - out = np.matmul(inp, q_weight.T) + q_weight = quant_utils.qdq_data( + weight.reshape((-1, group_size)), + "int" + str(num_bits), + sym, + ).reshape(weight.shape) + + q_weight = q_weight[: org_w_shape[0], :] / np.expand_dims(scales, axis=-1) + out = np.matmul(inp, q_weight) loss += np.mean(np.power((org_out - out), 2)) is_best = loss < best_error @@ -123,10 +128,6 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, best_scale = scales for node in nodes: - weight_config.setdefault((node.name, node.op_type), {}).update({"weight_bits": num_bits}) - weight_config.setdefault((node.name, node.op_type), {}).update({"weight_group_size": group_size}) - weight_config.setdefault((node.name, node.op_type), 
{}).update({"weight_sym": scheme == "sym"}) - init_share_num = model.get_initializer_share_num(node.input[1]) weight_tensor = model.get_initializer(node.input[1]) tensor = onnx.numpy_helper.to_array(weight_tensor, base_dir) @@ -136,7 +137,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, new_tensor = onnx.helper.make_tensor( name=node.input[1] + "_scaled", - data_type=utility.dtype_mapping[str(dtype)], + data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype), dims=tensor.shape, vals=tensor.tobytes(), raw=True, @@ -152,7 +153,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, continue if parent.op_type in ["LayerNormalization", "BatchNormalization", "InstanceNormalization"] and len( - model.input_name_to_nodes()[nodes[0].input[0]] + input_name_to_nodes[nodes[0].input[0]] ) == len(nodes): for idx in [1, 2]: tensor = onnx.numpy_helper.to_array(model.get_initializer(parent.input[idx]), base_dir) @@ -165,7 +166,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, elif ( parent.op_type in ["SimplifiedLayerNormalization", "MatMul", "Gemm", "Mul"] and not all([model.get_initializer(inp) is None for inp in parent.input]) - and len(model.input_name_to_nodes()[nodes[0].input[0]]) == len(nodes) + and len(input_name_to_nodes[nodes[0].input[0]]) == len(nodes) ): # pragma: no cover for inp in parent.input: if model.get_initializer(inp) is not None: @@ -176,7 +177,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, updated_nodes.append(parent.name) output_dicts[parent.output[0]] = output_dicts[parent.output[0]] / np.reshape(best_scale, (1, -1)) - elif parent.op_type in ["Conv", "FusedConv"] and len(model.input_name_to_nodes()[nodes[0].input[0]]) == len( + elif parent.op_type in ["Conv", "FusedConv"] and len(input_name_to_nodes[nodes[0].input[0]]) == len( nodes ): # pragma: no cover tensor = onnx.numpy_helper.to_array(model.get_initializer(parent.input[2]), base_dir) @@ -190,7 +191,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, # insert mul scale_tensor = onnx.helper.make_tensor( name=parent.output[0] + "_weight_only_scale", - data_type=utility.dtype_mapping[str(dtype)], + data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype), dims=best_scale.shape, vals=(1.0 / best_scale).flatten().tolist(), ) @@ -216,7 +217,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, return model, output_dicts -def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, group_size, scheme): +def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts): """Apply clip for weight by checking mse.""" base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" ratios = {} @@ -232,18 +233,17 @@ def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, inp = np.concatenate(output_dicts[nodes[0].input[0]], axis=0) for node in nodes: - if (node.name, node.op_type) in weight_config: - num_bits = weight_config[(node.name, node.op_type)].get("weight_bits", 4) - group_size = weight_config[(node.name, node.op_type)].get("weight_group_size", 32) - scheme = "sym" if weight_config[(node.name, node.op_type)].get("weight_sym", True) else "asym" + num_bits = weight_config[node.name].get("weight_bits", 4) + group_size = weight_config[node.name].get("weight_group_size", 32) + sym = weight_config[node.name].get("weight_sym", True) + accuracy_level = 
weight_config[node.name].get("accuracy_level", 0) org_weight = onnx.numpy_helper.to_array(model.get_initializer(node.input[1]), base_dir=base_dir) org_w_shape = org_weight.shape # ic, oc group_size = group_size if group_size != -1 else org_w_shape[0] org_out = np.matmul(inp, org_weight) # n_token, oc - k_blocks = (org_w_shape[0] - 1) // group_size + 1 - org_weight = woq_utility.pad_tensor(org_weight, group_size, k_blocks) + org_weight = quant_utils.pad_tensor(org_weight, group_size, k_blocks) org_weight = np.transpose(org_weight) @@ -259,15 +259,21 @@ def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, ): # pragma: no cover # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1 - weight = woq_utility.qdq_tensor( - weight, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1) - ) + weight = quant_utils.qdq_data( + weight.reshape((-1, group_size)), + "uint" + str(num_bits), + sym, + ratio=ratio, + ).reshape(org_weight.shape) else: - weight = woq_utility.qdq_tensor( - weight, num_bits, group_size, scheme, "int", ratios.get(node.input[1], 1) - ) - weight = np.reshape(weight, (org_w_shape[1], -1))[:, : org_w_shape[0]] - cur_out = np.matmul(inp, weight.T) + weight = quant_utils.qdq_data( + weight.reshape((-1, group_size)), + "int" + str(num_bits), + sym, + ratio=ratio, + ).reshape(org_weight.shape) + + cur_out = np.matmul(inp, weight[:, : org_w_shape[0]].T) loss = np.mean(np.power((org_out - cur_out), 2)) is_best = loss < best_error if is_best: @@ -281,12 +287,8 @@ def awq_quantize( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], data_reader: data_reader.CalibrationDataReader, weight_config: dict = {}, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", enable_auto_scale: bool = True, enable_mse_search: bool = True, - accuracy_level: int = 0, providers: List[str] = ["CPUExecutionProvider"], ) -> onnx.ModelProto: """Quant the model with Activation-aware Weight quantization(AWQ) method. @@ -306,16 +308,10 @@ def awq_quantize( 'accuracy_level': 0 } }. Defaults to {}. - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): size of weight groups. Defaults to 32. - scheme (str, optional): indicates whether weights are symmetric. Defaults to "asym". enable_auto_scale (bool, optional): whether to search for best scales based on activation distribution. Defaults to True. enable_mse_search (bool, optional): whether to search for the best clip range from range [0.91, 1.0, 0.01]. Defaults to True. - accuracy_level (int, optional): accuracy level. Support 0 (unset), - 1(fp32 compute type of jblas kernel), 2 (fp16 compute type of jblas kernel), - 3 (bf16 compute type of jblas kernel), 4 (int8 compute type of jblas kernel). Defaults to 0. providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. 
Returns: @@ -327,7 +323,7 @@ def awq_quantize( full_ratio = {} if enable_mse_search: - inputs, so = woq_utility.prepare_inputs(model, data_reader, providers) + inputs, so = quant_utils.prepare_inputs(model, data_reader, providers) del data_reader org_output = copy.deepcopy(model.model.graph.output) @@ -341,7 +337,7 @@ def awq_quantize( if ( node.op_type in ["MatMul"] and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" + and weight_config.get(node.name, {}).get("weight_dtype", "fp32") != "fp32" ): output_names.append(node.input[0]) output_names = list(set(output_names)) @@ -361,18 +357,20 @@ def awq_quantize( else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers) ) + output_name_to_node = model.output_name_to_node() + input_name_to_nodes = model.input_name_to_nodes() for input_name in output_names: - parent = model.output_name_to_node()[input_name] + parent = output_name_to_node[input_name] dump_pairs = {parent.name: []} - for node in model.input_name_to_nodes()[input_name]: + for node in input_name_to_nodes[input_name]: # check op_type of node is MatMul # check dim 1 of input is weight tensor # check weight_type is not "fp32" if ( node.op_type in ["MatMul"] and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" + and weight_config.get(node.name, {}).get("weight_dtype", "fp32") != "fp32" ): dump_pairs[parent.name].append(model.get_node(node.name)) @@ -390,9 +388,6 @@ def awq_quantize( weight_config, dump_pairs, output_dicts, - num_bits, - group_size, - scheme, ) if enable_mse_search: ratios = _apply_awq_clip( @@ -400,9 +395,6 @@ def awq_quantize( weight_config, dump_pairs, output_dicts, - num_bits, - group_size, - scheme, ) del output_dicts del dump_pairs @@ -410,7 +402,7 @@ def awq_quantize( model.remove_tensors_from_outputs(output_names) model.model.graph.output.MergeFrom(org_output) - model = rtn.rtn_quantize(model, weight_config, num_bits, group_size, scheme, full_ratio, accuracy_level, providers) + model = rtn.rtn_quantize(model, weight_config, full_ratio, providers) return model @@ -418,6 +410,9 @@ def apply_awq_on_model( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], quant_config: dict, calibration_data_reader: data_reader.CalibrationDataReader, + enable_auto_scale: bool = True, + enable_mse_search: bool = True, + providers: List[str] = ["CPUExecutionProvider"], ) -> onnx.ModelProto: """Apply Activation-aware Weight quantization(AWQ) on onnx model. @@ -430,12 +425,11 @@ def apply_awq_on_model( onnx.ModelProto: quantized onnx model. 
""" # set model params - kwargs = {} - kwargs = {key: quant_config.pop(key) for key in config.AWQConfig.model_params_list if key in quant_config} - - # change op config to dict type - for op_name_type, op_config in quant_config.items(): - if isinstance(op_config, config.AWQConfig): - quant_config[op_name_type] = op_config.to_dict() - - return awq_quantize(model, data_reader=calibration_data_reader, weight_config=quant_config, **kwargs) + kwargs = { + "enable_auto_scale": enable_auto_scale, + "enable_mse_search": enable_mse_search, + "providers": providers, + } + q_model = awq_quantize(model, data_reader=calibration_data_reader, weight_config=quant_config, **kwargs) + quant_utils.dump_woq_stats(q_model, quant_config) + return q_model diff --git a/onnx_neural_compressor/algorithms/weight_only/gptq.py b/onnx_neural_compressor/algorithms/weight_only/gptq.py index 5016a2780..ae3813280 100644 --- a/onnx_neural_compressor/algorithms/weight_only/gptq.py +++ b/onnx_neural_compressor/algorithms/weight_only/gptq.py @@ -24,9 +24,10 @@ import onnxruntime as ort from packaging.version import Version -from onnx_neural_compressor import config, constants, data_reader, onnx_model, utility +from onnx_neural_compressor import constants, data_reader, onnx_model, utility +from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor.algorithms.layer_wise import core -from onnx_neural_compressor.algorithms.weight_only import utility as woq_utility +from onnx_neural_compressor.quantization import config from typing import List, Union # isort: skip @@ -36,8 +37,8 @@ def _gptq( H: np.array, num_bits: int = 4, group_size: int = 32, - scheme: str = "asym", - blocksize: int = 128, + sym: bool = False, + block_size: int = 128, percdamp: float = 0.01, actorder: bool = False, mse: bool = False, @@ -50,8 +51,8 @@ def _gptq( H (np.array): Hessian matrix. num_bits (int, optional): num_bits. Default is 4. group_size (int, optional): how many elements share one scale/zp. Default is 32. - scheme (str, optional): sym or asym. Defaults to "asym". - blocksize (int, optional): blocksize to quantize weight. + sym (bool, optional): sym or asym. Defaults to False. + block_size (int, optional): block_size to quantize weight. percdamp (float, optional): percent of the average Hessian diagonal to use for dampening. actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value. mse (bool, optional): whether get scale and zero point with mse error. 
@@ -74,7 +75,7 @@ def find_params(weight): tmp = np.zeros(weight.shape[1]) xmin = np.minimum(np.min(weight, axis=0), tmp) xmax = np.maximum(np.max(weight, axis=0), tmp) - if scheme == "sym": + if sym: xmax = np.maximum(np.abs(xmin), xmax) tmp = xmin < 0 if np.any(tmp): @@ -84,7 +85,7 @@ def find_params(weight): xmax[tmp] = +1 scale = (xmax - xmin) / maxq - if scheme == "sym": + if sym: zero = np.ones(scale.shape) * (maxq + 1) / 2 else: zero = np.round(-xmin / scale) @@ -95,7 +96,7 @@ def find_params(weight): xmin1 = p * xmin xmax1 = p * xmax scale1 = (xmax1 - xmin1) / maxq - zero1 = np.round(-xmin1 / scale1) if scheme != "sym" else zero + zero1 = np.round(-xmin1 / scale1) if not sym else zero q = np.clip(np.round(weight / scale1) + zero1, 0, maxq) q -= weight q = np.power(np.abs(q), norm) @@ -134,8 +135,8 @@ def find_params(weight): H[diag, diag] += damp # add a average value of H = np.linalg.cholesky(np.linalg.inv(H)).T Hinv = H - for i1 in range(0, shape[0], blocksize): - i2 = min(i1 + blocksize, shape[0]) + for i1 in range(0, shape[0], block_size): + i2 = min(i1 + block_size, shape[0]) count = i2 - i1 W1 = copy.deepcopy(W[i1:i2, :]) @@ -178,15 +179,11 @@ def gptq_quantize( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], data_reader: data_reader.CalibrationDataReader, weight_config: dict = {}, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", percdamp: float = 0.01, - blocksize: int = 128, + block_size: int = 128, actorder: bool = False, mse: bool = False, perchannel: bool = True, - accuracy_level: int = 0, providers: List[str] = ["CPUExecutionProvider"], return_modelproto: bool = True, ): @@ -206,19 +203,13 @@ def gptq_quantize( 'weight_sym': True, 'accuracy_level': 0 }. Defaults to {}. - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): size of weight groups. Defaults to 32. - scheme (str, optional): indicates whether weights are symmetric. Defaults to "asym". percdamp (float, optional): percentage of Hessian's diagonal values' average, which will be added to Hessian's diagonal to increase numerical stability. Defaults to 0.01. - blocksize (int, optional): execute GPTQ quantization per block. Defaults to 128. + block_size (int, optional): execute GPTQ quantization per block. Defaults to 128. actorder (bool, optional): whether to sort Hessian's diagonal values to rearrange channel-wise quantization order. Defaults to False. mse (bool, optional): whether get scale and zero point with mse error. Defaults to False. perchannel (bool, optional): whether quantize weight per-channel. Defaults to True. - accuracy_level (int, optional): accuracy level. Support 0 (unset), - 1(fp32 compute type of jblas kernel), 2 (fp16 compute type of jblas kernel), - 3 (bf16 compute type of jblas kernel), 4 (int8 compute type of jblas kernel). Defaults to 0. providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. return_modelproto (bool, optionmal): whether to return onnx.Modelproto. set False for layer-wise quant. 
Default to True @@ -230,7 +221,7 @@ def gptq_quantize( model = onnx_model.ONNXModel(model) base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" - inputs, so = woq_utility.prepare_inputs(model, data_reader, providers) + inputs, so = quant_utils.prepare_inputs(model, data_reader, providers) del data_reader org_output = copy.deepcopy(model.model.graph.output) model.remove_tensors_from_outputs([i.name for i in org_output]) @@ -242,7 +233,7 @@ def gptq_quantize( if ( node.op_type in ["MatMul"] and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" + and weight_config.get(node.name, {}).get("weight_dtype", "fp32") != "fp32" ): output_names.append(node.input[0]) output_names = list(set(output_names)) @@ -262,19 +253,21 @@ def gptq_quantize( else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers) ) + input_name_to_nodes = model.input_name_to_nodes() + for idx, input_name in enumerate(output_names): utility.simple_progress_bar(len(output_names), idx + 1) node_list = [] weights = [] - for node in model.input_name_to_nodes()[input_name]: + for node in input_name_to_nodes[input_name]: # check op_type of node is MatMul # check dim 1 of input is weight tensor # check weight_type is not "fp32" if ( node.op_type in ["MatMul"] and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" + and weight_config.get(node.name, {}).get("weight_dtype", "fp32") != "fp32" ): weight = onnx.numpy_helper.to_array( model.get_initializer(model.get_node(node.name).input[1]), base_dir @@ -304,11 +297,10 @@ def gptq_quantize( weight, H, ) in zip(node_list, weights, Hs): - if (node.name, node.op_type) in weight_config: - num_bits = weight_config[(node.name, node.op_type)].get("weight_bits", 4) - group_size = weight_config[(node.name, node.op_type)].get("weight_group_size", 32) - scheme = "sym" if weight_config[(node.name, node.op_type)].get("weight_sym", True) else "asym" - accuracy_level = weight_config[(node.name, node.op_type)].get("accuracy_level", 0) + num_bits = weight_config[node.name].get("weight_bits", 4) + group_size = weight_config[node.name].get("weight_group_size", 32) + sym = weight_config[node.name].get("weight_sym", True) + accuracy_level = weight_config[node.name].get("accuracy_level", 0) group_size = group_size if group_size != -1 else weight.shape[0] dtype = weight.dtype @@ -317,8 +309,8 @@ def gptq_quantize( H, num_bits=num_bits, group_size=group_size, - scheme=scheme, - blocksize=blocksize, + sym=sym, + block_size=block_size, percdamp=percdamp, actorder=actorder, mse=mse, @@ -340,10 +332,14 @@ def gptq_quantize( # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1, supported by CPU EP AND CUDA EP org_shape = weight.shape k_blocks = (org_shape[0] + group_size - 1) // group_size - q_weight = woq_utility.pad_tensor(q_weight, group_size, k_blocks) - q_weight, scale, zp = woq_utility.quant_tensor(q_weight.T, num_bits, group_size, scheme, "uint") - - q_matmul_node, new_inits = woq_utility.make_matmul_weight_only_node( + q_weight = quant_utils.pad_tensor(q_weight, group_size, k_blocks) + _, _, zp, scale, q_weight = quant_utils.quantize_data( + q_weight.T, + "uint" + str(num_bits), + sym, + axis=1, + ) + q_matmul_node, new_inits = quant_utils.make_matmul_weight_only_node( node=node, weight_shape=org_shape, num_bits=num_bits, @@ -351,7 +347,7 @@ def gptq_quantize( 
k_blocks=k_blocks, q_weight=q_weight.astype("uint8"), scale=scale.astype(dtype), - zero_point=zp if scheme == "asym" else None, + zero_point=zp if not sym else None, accuracy_level=accuracy_level, ) @@ -361,7 +357,7 @@ def gptq_quantize( else: q_weight_tensor = onnx.helper.make_tensor( name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)), - data_type=utility.dtype_mapping[str(dtype)], + data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype), dims=q_weight.shape, vals=q_weight.astype(dtype).tobytes(), raw=True, @@ -391,6 +387,13 @@ def apply_gptq_on_model( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], quant_config: dict, calibration_data_reader: data_reader.CalibrationDataReader, + percdamp: float = 0.01, + block_size: int = 128, + actorder: bool = False, + mse: bool = False, + perchannel: bool = True, + providers: List[str] = ["CPUExecutionProvider"], + layer_wise_quant: bool = False, ) -> onnx.ModelProto: """Apply GPTQ on onnx model. @@ -402,18 +405,17 @@ def apply_gptq_on_model( Returns: onnx.ModelProto: quantized onnx model. """ - # check whether to do layer_wise quant - layer_wise = quant_config.pop("layer_wise_quant", False) - # set other model params - quant_kwargs = {} - quant_kwargs = {key: quant_config.pop(key) for key in config.GPTQConfig.model_params_list if key in quant_config} - - # change op config to dict type - for op_name_type, op_config in quant_config.items(): - if isinstance(op_config, config.GPTQConfig): - quant_config[op_name_type] = op_config.to_dict() - if layer_wise: + quant_kwargs = { + "percdamp": percdamp, + "block_size": block_size, + "actorder": actorder, + "mse": mse, + "perchannel": perchannel, + "providers": providers, + } + + if layer_wise_quant: quantized_model = core.layer_wise_quant( model, quant_func=gptq_quantize, @@ -428,4 +430,5 @@ def apply_gptq_on_model( if isinstance(quantized_model, onnx_model.ONNXModel): quantized_model = quantized_model.model + quant_utils.dump_woq_stats(quantized_model, quant_config) return quantized_model diff --git a/onnx_neural_compressor/algorithms/weight_only/rtn.py b/onnx_neural_compressor/algorithms/weight_only/rtn.py index 619c055e1..18fdc1e47 100644 --- a/onnx_neural_compressor/algorithms/weight_only/rtn.py +++ b/onnx_neural_compressor/algorithms/weight_only/rtn.py @@ -1,10 +1,7 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# # Copyright (c) 2023 MIT HAN Lab # This source code is licensed under the MIT license # -# Copyright (c) 2023 Intel Corporation +# Copyright (c) 2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
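Since apply_gptq_on_model (updated above) now takes its model-level options as explicit keyword arguments instead of popping them out of quant_config, and weight_config is keyed by node name rather than (name, op_type) tuples, a call could look like the sketch below. This is a hedged usage sketch, not documented API usage: the model path, node name, input name, and RandomDataReader are placeholders, and the reader's rewind method is assumed from the usual CalibrationDataReader interface (only get_next is confirmed by the code in this patch).

import numpy as np

from onnx_neural_compressor import data_reader
from onnx_neural_compressor.algorithms.weight_only import gptq


class RandomDataReader(data_reader.CalibrationDataReader):
    """Placeholder calibration reader yielding one random batch."""

    def __init__(self):
        self._data = iter([{"input_ids": np.random.randint(0, 100, (1, 32), dtype=np.int64)}])

    def get_next(self):
        return next(self._data, None)

    def rewind(self):  # assumed part of the reader interface
        pass


# Per-node options, keyed by node name (no longer by (name, op_type) tuples).
weight_config = {
    "/decoder/layers.0/fc1/MatMul": {      # hypothetical node name
        "weight_dtype": "int",
        "weight_bits": 4,
        "weight_group_size": 32,
        "weight_sym": True,
        "accuracy_level": 0,
    }
}

q_model = gptq.apply_gptq_on_model(
    "model.onnx",                          # hypothetical model path
    quant_config=weight_config,
    calibration_data_reader=RandomDataReader(),
    percdamp=0.01,
    block_size=128,
    actorder=False,
    mse=False,
    perchannel=True,
    providers=["CPUExecutionProvider"],
    layer_wise_quant=False,
)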
@@ -26,9 +23,9 @@ import onnxruntime as ort from packaging import version -from onnx_neural_compressor import config, constants, onnx_model, utility +from onnx_neural_compressor import constants, onnx_model, utility +from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor.algorithms.layer_wise import core -from onnx_neural_compressor.algorithms.weight_only import utility as woq_utility from typing import List, Union # isort: skip @@ -36,11 +33,7 @@ def rtn_quantize( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], weight_config: dict = {}, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", ratios: dict = {}, - accuracy_level: int = 0, providers: List[str] = ["CPUExecutionProvider"], return_modelproto: bool = True, ): @@ -60,14 +53,7 @@ def rtn_quantize( 'accuracy_level': 0 } }. Defaults to {}. - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): size of weight groups. Defaults to 32. - scheme (str, optional): indicates whether weights are symmetric. Defaults to "asym". ratios (dict, optional): percentile of clip. Defaults to {}. - accuracy_level (int, optional): - accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel). Defaults to 0. providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. return_modelproto (bool, optionmal): whether to return onnx.Modelproto. set False for layer-wise quant. Default to True @@ -92,7 +78,7 @@ def rtn_quantize( if ( node.op_type in ["MatMul"] # check op_type of node is MatMul and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" + and weight_config.get(node.name, {}).get("weight_dtype", "fp32") != "fp32" ): weight_tensor = model.get_initializer(node.input[1]) weight = onnx.numpy_helper.to_array(weight_tensor, base_dir=base_dir).copy() @@ -100,11 +86,10 @@ def rtn_quantize( continue dtype = weight.dtype - if (node.name, node.op_type) in weight_config: - num_bits = weight_config[(node.name, node.op_type)].get("weight_bits", 4) - group_size = weight_config[(node.name, node.op_type)].get("weight_group_size", 32) - scheme = "sym" if weight_config[(node.name, node.op_type)].get("weight_sym", True) else "asym" - accuracy_level = weight_config[(node.name, node.op_type)].get("accuracy_level", 0) + num_bits = weight_config[node.name].get("weight_bits", 4) + group_size = weight_config[node.name].get("weight_group_size", 32) + sym = weight_config[node.name].get("weight_sym", True) + accuracy_level = weight_config[node.name].get("accuracy_level", 0) org_w_shape = weight.shape # ic, oc group_size = group_size if group_size != -1 else org_w_shape[0] @@ -112,7 +97,7 @@ def rtn_quantize( k_blocks = (org_w_shape[0] - 1) // group_size + 1 init_share_num = model.get_initializer_share_num(node.input[1]) - weight = woq_utility.pad_tensor(weight, group_size, k_blocks) + weight = quant_utils.pad_tensor(weight, group_size, k_blocks) satisfy_MatMulNBits_condition = ( version.Version(ort.__version__) > constants.ONNXRT1161_VERSION and num_bits == 4 @@ -126,10 +111,14 @@ def rtn_quantize( ): # pragma: no cover # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions, supported by CPU EP # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1, supported by 
CPU EP AND CUDA EP - q_weight, scale, zp = woq_utility.quant_tensor( - weight.T, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1) + _, _, zp, scale, q_weight = quant_utils.quantize_data( + weight.T.reshape((-1, group_size)), + "uint" + str(num_bits), + sym, + ratio=ratios.get(node.input[1], 1), + axis=1, ) - q_matmul_node, new_inits = woq_utility.make_matmul_weight_only_node( + q_matmul_node, new_inits = quant_utils.make_matmul_weight_only_node( node=node, weight_shape=org_w_shape, num_bits=num_bits, @@ -137,7 +126,7 @@ def rtn_quantize( k_blocks=k_blocks, q_weight=q_weight.astype("uint8"), scale=scale.astype(dtype), - zero_point=zp if scheme == "asym" else None, + zero_point=zp if not sym else None, accuracy_level=accuracy_level, ) @@ -145,15 +134,19 @@ def rtn_quantize( remove_nodes.append(node) new_nodes.append(q_matmul_node) else: - q_weight = woq_utility.qdq_tensor( - weight.T, num_bits, group_size, scheme, "int", ratios.get(node.input[1], 1) + q_weight = quant_utils.qdq_data( + weight.T.reshape((-1, group_size)), + "int" + str(num_bits), + sym, + ratio=ratios.get(node.input[1], 1), + axis=1, ) q_weight = np.reshape(q_weight, (org_w_shape[1], -1)) q_weight = np.transpose(q_weight) q_weight = q_weight[: org_w_shape[0], :].astype(dtype) q_weight_tensor = onnx.helper.make_tensor( name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)), - data_type=utility.dtype_mapping[str(dtype)], + data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype), dims=weight.shape, vals=q_weight.tobytes(), raw=True, @@ -178,7 +171,11 @@ def rtn_quantize( def apply_rtn_on_model( - model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], quant_config: dict + model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], + quant_config: dict, + ratios: dict = {}, + providers: List[str] = ["CPUExecutionProvider"], + layer_wise_quant: bool = False, ) -> onnx.ModelProto: """Apply RTN on onnx model. @@ -189,19 +186,12 @@ def apply_rtn_on_model( Returns: onnx.ModelProto: quantized onnx model. """ - # check whether to do layer_wise quant - layer_wise = quant_config.pop("layer_wise_quant", False) - - # set other model params - quant_kwargs = {} - quant_kwargs = {key: quant_config.pop(key) for key in config.RTNConfig.model_params_list if key in quant_config} - - # change op config to dict type - for op_name_type, op_config in quant_config.items(): - if isinstance(op_config, config.RTNConfig): - quant_config[op_name_type] = op_config.to_dict() + quant_kwargs = { + "ratios": ratios, + "providers": providers, + } - if layer_wise: + if layer_wise_quant: quantized_model = core.layer_wise_quant( model, quant_func=rtn_quantize, weight_config=quant_config, **quant_kwargs ) @@ -210,4 +200,5 @@ def apply_rtn_on_model( if isinstance(quantized_model, onnx_model.ONNXModel): quantized_model = quantized_model.model + quant_utils.dump_woq_stats(quantized_model, quant_config) return quantized_model diff --git a/onnx_neural_compressor/algorithms/weight_only/utility.py b/onnx_neural_compressor/algorithms/weight_only/utility.py deleted file mode 100644 index ddb5f990d..000000000 --- a/onnx_neural_compressor/algorithms/weight_only/utility.py +++ /dev/null @@ -1,332 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 MIT HAN Lab -# This source code is licensed under the MIT license -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import struct -import sys -from importlib import util - -import numpy as np -import onnx -import onnxruntime as ort -from packaging import version - -from onnx_neural_compressor import constants, utility - -if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): # pragma: no cover - import onnxruntime_extensions - - -def _get_blob_size(group_size, has_zp): # pragma: no cover - """Get blob_size. - - Args: - group_size (int): how many elements share one scale/zp - has_zp (bool): whether zero_point is None - """ - if version.Version(ort.__version__) > constants.ONNXRT1161_VERSION: - blob_size = group_size // 2 - elif has_zp: - blob_size = group_size // 2 + 4 + 1 - else: - blob_size = group_size // 2 + 4 - return blob_size - - -def make_matmul_weight_only_node( - node: onnx.NodeProto, - weight_shape: tuple, - num_bits: int, - group_size: int, - k_blocks: int, - q_weight: np.array, - scale: np.array, - zero_point: np.array, - accuracy_level: int = 0, -): - """Build MatMulFpQ4/MatMulNBits node. - - Args: - node (onnx.NodeProto): original matmul node - weight_shape (tuple): original weight shape - num_bits (int): number of bits used to represent weights. - group_size (int): how many elements share one scale/zp - k_blocks (int): block number - q_weight (np.array): quantized weight - scale (np.array): scale - zero_point (np.array): zero point - accuracy_level (int, optional): accuracy level. - Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel) Defaults to 0. 
- - Returns: - matmul_weight_only_node: MatMulFpQ4 or MatMulNBits node - new_inits: initializers of the new node - """ - blob_size = _get_blob_size(group_size, zero_point is not None) - packed = np.zeros((q_weight.shape[0], blob_size), dtype="uint8") - q_weight_name = node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)) - input_names = [node.input[0], q_weight_name] - new_inits = [] - kwargs = {} - - if version.Version(ort.__version__) > constants.ONNXRT1161_VERSION: - op_type = "MatMulNBits" - - # pack quantized weight - q_weight_pairs = q_weight[:, ::2] | q_weight[:, 1::2] << 4 - packed[:, :] = q_weight_pairs[:, :blob_size] - packed = np.reshape(packed, (-1, k_blocks, blob_size)) - - # build scale tensor - scale = np.reshape(scale, (-1, k_blocks)) - scale_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_scale", - data_type=utility.dtype_mapping[str(scale.dtype)], - dims=scale.shape, - vals=scale.tobytes(), - raw=True, - ) - input_names.append(scale_tensor.name) - new_inits.append(scale_tensor) - - # build zero_point tensor - if zero_point is not None: - if num_bits > 4: - packed_zp = np.reshape(zero_point, (1, -1)).astype("uint8") - else: - packed_zp = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8") - # create an index array - idx = np.arange(zero_point.shape[0] // k_blocks * k_blocks).reshape(-1) - # separate odd and even indices - even_idx = idx[::2] - odd_idx = idx[1::2] - # vectorized operation for even and odd indices - packed_zp[even_idx // 2] = (packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel() - packed_zp[odd_idx // 2] = (packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4) - - zp_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True - ) - input_names.append(zp_tensor.name) - new_inits.append(zp_tensor) - - # set kwargs - kwargs["K"] = weight_shape[0] - kwargs["N"] = weight_shape[1] - kwargs["bits"] = num_bits - kwargs["block_size"] = group_size - if accuracy_level > 0: - # require onnxruntime > 1.16.3 - kwargs["accuracy_level"] = accuracy_level - - else: - offset = 5 if zero_point is not None else 4 - op_type = "MatMulFpQ4" - - # pack quantized weight - for i in range(q_weight.shape[0]): - bf = struct.pack("f", scale[i]) - packed[i][0] = bf[0] - packed[i][1] = bf[1] - packed[i][2] = bf[2] - packed[i][3] = bf[3] - - if zero_point is not None: - packed[i][4] = zero_point[i] - - packed[i][offset:] = np.bitwise_or( - q_weight[i][: group_size // 2], np.left_shift(q_weight[i][group_size // 2 :], num_bits) - ) - packed = packed.reshape(-1) - - # build shape tensor - shape_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_shape", data_type=7, dims=(2,), vals=np.array(weight_shape, dtype="int64") - ) - new_inits.append(shape_tensor) - input_names.append(shape_tensor.name) - - # set kwargs - kwargs["blk_quant_type"] = 1 if zero_point is not None else 0 - - q_weight_tensor = onnx.helper.make_tensor( - name=q_weight_name, - data_type=2, - dims=packed.shape, - vals=packed.tobytes(), - raw=True, - ) - new_inits.append(q_weight_tensor) - - matmul_weight_only_node = onnx.helper.make_node( - op_type, - inputs=input_names, - outputs=node.output, - name=node.name + "_Q" + str(num_bits) if node.name else "_Q" + str(num_bits), - domain="com.microsoft", - **kwargs, - ) - return matmul_weight_only_node, new_inits - - -def prepare_inputs(model, data_reader, providers): - """Prepare inputs for weight only quantization. 
- - Args: - model (ModelProto or onnx_model.ONNXModel): onnx model. - data_reader (CalibrationDataReader): a calibration data reader. - providers (list): providers to use. - - Returns: - inputs: prepared inputs. - so: session options - """ - - so = ort.SessionOptions() - if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): # pragma: no cover - so.register_custom_ops_library(onnxruntime_extensions.get_library_path()) - if model.is_large_model: - onnx.save_model( - model.model, - model.model_path + "_augment.onnx", - save_as_external_data=True, - all_tensors_to_one_file=True, - convert_attribute=False, - ) - - inputs_list = [] - while True: - inputs = data_reader.get_next() - if not inputs: - break - inputs_list.append(inputs) - return inputs_list, so - - -def pad_tensor(weight, group_size, k_blocks): - """Pad tensor rowi so that it can be is divisible by group_size. - - Args: - weight (array): weight - group_size (int): how many elements share one scale/zp - k_blocks (int): the number of block - - Returns: - weight: paded weight - """ - if group_size == -1: - return weight - - org_w_shape = weight.shape - padded_rows = k_blocks * group_size - pad_len = padded_rows - org_w_shape[0] - - if pad_len > 0: - weight = np.pad(weight, ((0, pad_len), (0, 0)), "constant") - - return weight - - -def quant_tensor( - data: np.array, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", - dtype: str = "int", - ratio: float = 1.0, -): - """Quantize tensor per group. - - Args: - data (np.array): input weight - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): how many elements share one scale/zp. Defaults to 4. - scheme (str, optional): _quantization scheme. Defaults to "asym". - dtype (str, optional): data type. Defaults to "int". - ratio (float, optional): percentile of clip. Defaults to 1.0. - - Returns: - output: quantized weight - scale: scale - zero_point: zero point - """ - data = np.reshape(data, (-1, group_size)) - if scheme == "asym" or dtype == "uint": - maxq = 2**num_bits - 1 - minq = 0 - elif scheme == "sym": - maxq = 2 ** (num_bits - 1) - 1 if num_bits != 1 else 0 - minq = -(2 ** (num_bits - 1)) if num_bits != 1 else -1 - - rmin = np.min(data, axis=1, keepdims=True) * ratio - rmax = np.max(data, axis=1, keepdims=True) * ratio - if scheme == "sym": - max_range = np.maximum(np.abs(rmin), np.abs(rmax)) - - scale = np.ones(rmax.shape) - mask = max_range > 0 - scale[mask] = (max_range[mask] * 2.0).astype(np.float64) / (maxq - minq) - zero_point = ( - np.zeros(scale.shape) if dtype == "int" else np.ones(rmax.shape, dtype="uint8") * (1 << (num_bits - 1)) - ) - else: - scale = np.ones(rmax.shape) - scale[rmin != rmax] = np.array( - [float(i) / (maxq - minq) for i in (rmax - rmin)[rmin != rmax].flatten().tolist()] - ) - zero_point = ( - ((np.zeros(scale.shape) - rmin) / scale).round() - if dtype == "int" - else np.maximum(0, np.minimum(maxq, ((np.zeros(scale.shape) - rmin) / scale).round())).astype("uint8") - ) - q_weight = np.empty_like(data, dtype=scale.dtype) - np.divide(data, scale, out=q_weight) - np.add(q_weight, zero_point, out=q_weight) - np.round(q_weight, out=q_weight) - np.clip(q_weight, minq, maxq, out=q_weight) - - return q_weight, scale, zero_point - - -def qdq_tensor( - data: np.array, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", - dtype: str = "int", - ratio: float = 1.0, -): - """Quant dequant tensor per group. 
- - Args: - data (np.array): input weight - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): how many elements share one scale/zp. Defaults to 32. - scheme (str, optional): quantization scheme. Defaults to "asym". - dtype (str, optional): data type. Defaults to "int". - ratio (float, optional): percentile of clip. Defaults to 1.0. - - Returns: - output: quant-dequant weight - """ - org_shape = data.shape - weight, scale, zp = quant_tensor(data, num_bits, group_size, scheme, dtype, ratio) - return np.reshape(scale * (weight - zp), org_shape) diff --git a/onnx_neural_compressor/config.py b/onnx_neural_compressor/config.py deleted file mode 100644 index b6fad923a..000000000 --- a/onnx_neural_compressor/config.py +++ /dev/null @@ -1,1239 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import enum -import inspect -import itertools -import json -import pathlib -import re -from abc import ABC, abstractmethod - -import numpy as np -import onnx -import pydantic -from onnxruntime import quantization -from typing_extensions import Self - -from onnx_neural_compressor import constants, data_reader, logger, utility - -from collections import OrderedDict # isort: skip -from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Type, Union, _GenericAlias # isort: skip - - -class ParamLevel(enum.Enum): - OP_LEVEL = enum.auto() - OP_TYPE_LEVEL = enum.auto() - MODEL_LEVEL = enum.auto() - - -class TuningParam: - """Define the tunable parameter for the algorithm. - - Example: - Class FakeAlgoConfig(config.BaseConfig): - '''Fake algo config.'''. - - params_list = [ - ... - # For simple tunable types, like a list of int, giving - # the param name is enough. `config.BaseConfig` class will - # create the `TuningParam` implicitly. - "simple_attr" - - # For complex tunable types, like a list of lists, - # developers need to create the `TuningParam` explicitly. - TuningParam("complex_attr", tunable_type=List[List[str]]) - - # The default parameter level is `ParamLevel.OP_LEVEL`. - # If the parameter is at a different level, developers need - # to specify it explicitly. - TuningParam("model_attr", level=ParamLevel.MODEL_LEVEL) - - ... - - # TODO: more examples to explain the usage of `TuningParam`. - """ - - def __init__( - self, - name: str, - default_val: Any = None, - tunable_type=None, - options=None, - level: ParamLevel = ParamLevel.OP_LEVEL, - ) -> None: - self.name = name - self.default_val = default_val - self.tunable_type = tunable_type - self.options = options - self.level = level - - @staticmethod - def create_input_args_model(expect_args_type: Any) -> type: - """Dynamically create an InputArgsModel based on the provided type hint. - - Parameters: - - expect_args_type (Any): The user-provided type hint for input_args. - - Returns: - - type: The dynamically created InputArgsModel class. 
- """ - - class DynamicInputArgsModel(pydantic.BaseModel): - input_args: expect_args_type - - return DynamicInputArgsModel - - def is_tunable(self, value: Any) -> bool: - # Use `Pydantic` to validate the input_args. - # TODO: refine the implementation in further. - assert isinstance(self.tunable_type, _GenericAlias), f"Expected a type hint, got {self.tunable_type} instead." - DynamicInputArgsModel = TuningParam.create_input_args_model(self.tunable_type) - try: - new_args = DynamicInputArgsModel(input_args=value) - return True - except Exception as e: - logger.debug(f"Failed to validate the input_args: {e}") - return False - - -# Config registry to store all registered configs. -class ConfigRegistry(object): - registered_configs = {} - _config_registry = None - - def __new__(cls) -> Self: - if cls._config_registry is None: - cls._config_registry = super(ConfigRegistry, cls).__new__(cls) - - return cls._config_registry - - @classmethod - def register_config_impl(cls, algo_name: str, priority: Union[float, int] = 0): - """Register config decorator. - - The register the configuration classes for different algorithms. - - Usage example: - @ConfigRegistry.register_config(algo_name=ExampleAlgorithm, priority=100) - class ExampleAlgorithmConfig: - # Configuration details for the ExampleAlgorithm - - Args: - algo_name: the algorithm name. - priority: priority: the priority of the configuration. A larger number indicates a higher priority, - which will be tried first at the auto-tune stage. Defaults to 0. - """ - - def decorator(config_cls): - cls.registered_configs[algo_name] = {"priority": priority, "cls": config_cls} - return config_cls - - return decorator - - @classmethod - def get_all_configs(cls) -> Dict[str, Dict[str, Dict[str, object]]]: - """Get all registered configurations.""" - return cls.registered_configs - - @classmethod - def get_sorted_configs(cls) -> Dict[str, OrderedDict[str, Dict[str, object]]]: - """Get registered configurations sorted by priority.""" - return OrderedDict(sorted(cls.registered_configs.items(), key=lambda x: x[1]["priority"], reverse=True)) - - @classmethod - def get_cls_configs(cls) -> Dict[str, Dict[str, object]]: - """Get registered configurations without priority.""" - cls_configs = {} - for algo_name, config_data in cls.registered_configs.items(): - cls_configs[algo_name] = config_data["cls"] - return cls_configs - - @classmethod - def get_all_config_cls(cls) -> List[Type[BaseConfig]]: - configs_cls = [] - for algo_name, config_pairs in cls.registered_configs.items(): - configs_cls.append(config_pairs["cls"]) - return configs_cls - - -config_registry = ConfigRegistry() - - -def register_config(algo_name: str, priority: Union[float, int] = 0): - """Register config decorator. - - The register the configuration classes for different algorithms. - - Usage example: - @register_config(algo_name=ExampleAlgorithm, priority=100) - class ExampleAlgorithmConfig: - # Configuration details for the ExampleAlgorithm - - Args: - algo_name: the algorithm name. - priority: the priority of the configuration. A larger number indicates a higher priority, - which will be tried first at the auto-tune stage. Defaults to 0. 
- """ - - return config_registry.register_config_impl(algo_name=algo_name, priority=priority) - - -class BaseConfig(ABC): - """The base config for all algorithm configs.""" - - name = constants.BASE_CONFIG - params_list: List[Union[str, TuningParam]] = [] - - def __init__( - self, - white_list: Optional[Union[Union[str, Callable], List[Union[str, Callable]]]] = constants.DEFAULT_WHITE_LIST, - ) -> None: - self._global_config: Optional[BaseConfig] = None - # For PyTorch, operator_type is the collective name for module type and functional operation type, - # for example, `torch.nn.Linear`, and `torch.nn.functional.linear`. - # local config is the collections of operator_type configs and operator configs - self._local_config: Dict[str, Optional[BaseConfig]] = {} - self._white_list = white_list - - def _post_init(self): - if self.white_list == constants.DEFAULT_WHITE_LIST: - global_config = self.get_params_dict() - self._global_config = self.__class__(**global_config, white_list=None) - elif isinstance(self.white_list, list) and len(self.white_list) > 0: - for op_name_or_type in self.white_list: - global_config = self.get_params_dict() - tmp_config = self.__class__(**global_config, white_list=None) - self.set_local(op_name_or_type, tmp_config) - elif self.white_list == constants.EMPTY_WHITE_LIST: - return - else: - raise NotImplementedError( - f"The white list should be one of {constants.DEFAULT_WHITE_LIST}, {constants.EMPTY_WHITE_LIST}," - " a not empty list, but got {self.white_list}" - ) - - @property - def white_list(self): - return self._white_list - - @white_list.setter - def white_list(self, op_name_or_type_list: Optional[List[Union[str, Callable]]]): - self._white_list = op_name_or_type_list - - @property - def global_config(self): - return self._global_config - - @global_config.setter - def global_config(self, config): - self._global_config = config - - @property - def local_config(self): - return self._local_config - - @local_config.setter - def local_config(self, config): - self._local_config = config - - def set_local(self, operator_name: str, config: BaseConfig) -> BaseConfig: - if operator_name in self.local_config: - logger.warning("The configuration for %s has already been set, update it.", operator_name) - self.local_config[operator_name] = config - return self - - def to_dict(self): - result = {} - global_config = self.get_params_dict() - if bool(self.local_config): - result[constants.LOCAL] = {} - for op_name, config in self.local_config.items(): - result[constants.LOCAL][op_name] = config.to_dict() - if self.global_config: - result[constants.GLOBAL] = global_config - else: - result = global_config - return result - - def get_params_dict(self): - result = dict() - for param, value in self.__dict__.items(): - if param not in ["_global_config", "_local_config", "_white_list"]: - result[param] = value - return result - - @classmethod - def from_dict(cls, config_dict): - """Construct config from a dict. - - Args: - config_dict: _description_ - - Returns: - The constructed config. 
- """ - if constants.GLOBAL not in config_dict and constants.LOCAL not in config_dict: - config = cls(**config_dict) - return config - else: - config = cls(**config_dict.get(constants.GLOBAL, {})) - operator_config = config_dict.get(constants.LOCAL, {}) - if operator_config: - for op_name, op_config in operator_config.items(): - config.set_local(op_name, cls(**op_config)) - return config - - @classmethod - def to_diff_dict(cls, instance) -> Dict[str, Any]: - # TODO (Yi) to implement it - return {} - - @classmethod - def from_json_file(cls, filename): - with open(filename, "r", encoding="utf-8") as file: - config_dict = json.load(file) - return cls.from_dict(**config_dict) - - def to_json_file(self, filename): - config_dict = self.to_dict() - with open(filename, "w", encoding="utf-8") as file: - json.dump(config_dict, file, indent=4) - logger.info("Dump the config into %s.", filename) - - def to_json_string(self, use_diff: bool = False) -> Union[str, Dict]: - """Serializes this instance to a JSON string. - - Args: - use_diff (`bool`, *optional*, defaults to `True`): - If set to `True`, only the difference between the config instance and the default `BaseConfig()` - is serialized to JSON string. - - Returns: - `str`: String containing all the attributes that make up this configuration instance in JSON format. - """ - if use_diff is True: - config_dict = self.to_diff_dict(self) - else: - config_dict = self.to_dict() - try: - return json.dumps(config_dict, indent=2) + "\n" - except Exception as e: - logger.error("Failed to serialize the config to JSON string: %s", e) - return config_dict - - def __repr__(self) -> str: - return f"{self.__class__.__name__} {self.to_json_string()}" - - @classmethod - @abstractmethod - def register_supported_configs(cls): - """Add all supported configs.""" - raise NotImplementedError - - @classmethod - def validate(self, user_config: BaseConfig): - # TODO validate the user config - pass - - def __add__(self, other: BaseConfig) -> BaseConfig: - if isinstance(other, type(self)): - for op_name, config in other.local_config.items(): - self.set_local(op_name, config) - return self - else: - return ComposableConfig(configs=[self, other]) - - @staticmethod - def get_the_default_value_of_param(config: BaseConfig, param: str) -> Any: - # Get the signature of the __init__ method - signature = inspect.signature(config.__init__) - - # Get the parameters and their default values - parameters = signature.parameters - return parameters.get(param).default - - def expand(self) -> List[BaseConfig]: - """Expand the config. - - case 1 - { - "global": { "weight_bits": [4, 6]} - } - expand to : - 1st trial config: - { - "global": { "weight_bits": 4} - } - 2nd trial config: - { - "global": { "weight_bits": 6} - } - case 2 - # TODO to support the expansion of config with `local` - { - "global": { - "weight_bits": [4, 6] - }, - "local": - { - "fc1":{ - "weight_bits": [6, 8] - }, - "fc2":{ - "weight_bits": [4] - } - } - - } -> ? - """ - config_list: List[BaseConfig] = [] - params_list = self.params_list - config = self - tuning_param_list = [] - not_tuning_param_pair = {} # key is the param name, value is the user specified value - for param in params_list: - # Create `tuning.TuningParam` for each param - # There are two cases: - # 1. The param is a string. - # 2. The param is a `tuning.TuningParam` instance. 
- if isinstance(param, str): - default_param = self.get_the_default_value_of_param(config, param) - tuning_param = TuningParam(name=param, tunable_type=List[type(default_param)]) - elif isinstance(param, TuningParam): - tuning_param = param - else: - raise ValueError(f"Unsupported param type: {param}") - # Assign the options to the `tuning.TuningParam` instance - param_val = getattr(config, tuning_param.name) - if param_val is not None: - if tuning_param.is_tunable(param_val): - tuning_param.options = param_val - tuning_param_list.append(tuning_param) - else: - not_tuning_param_pair[tuning_param.name] = param_val - logger.debug("Tuning param list: %s", tuning_param_list) - logger.debug("Not tuning param pair: %s", not_tuning_param_pair) - if len(tuning_param_list) == 0: - config_list = [config] - else: - tuning_param_name_lst = [tuning_param.name for tuning_param in tuning_param_list] - for params_values in itertools.product(*[tuning_param.options for tuning_param in tuning_param_list]): - tuning_param_pair = dict(zip(tuning_param_name_lst, params_values)) - tmp_params_dict = {**not_tuning_param_pair, **tuning_param_pair} - new_config = self.__class__(**tmp_params_dict) - logger.info(new_config.to_dict()) - config_list.append(new_config) - logger.info("Expanded the %s and got %d configs.", self.__class__.name, len(config_list)) - return config_list - - def _get_op_name_op_type_config(self): - op_type_config_dict = dict() - op_name_config_dict = dict() - for name, config in self.local_config.items(): - if self._is_op_type(name): - op_type_config_dict[name] = config - else: - op_name_config_dict[name] = config - return op_type_config_dict, op_name_config_dict - - def to_config_mapping( - self, config_list: Optional[List[BaseConfig]] = None, model_info: List[Tuple[str, str]] = None - ) -> OrderedDict[Tuple[str, str], OrderedDict[str, BaseConfig]]: - config_mapping = OrderedDict() - if config_list is None: - config_list = [self] - for config in config_list: - global_config = config.global_config - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - for op_name, op_type in model_info: - if self.global_config is not None: - config_mapping[(op_name, op_type)] = global_config - if op_type in op_type_config_dict: - config_mapping[(op_name, op_type)] = op_name_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if isinstance(op_name, str) and re.match(op_name_pattern, op_name): - config_mapping[(op_name, op_type)] = op_name_config_dict[op_name_pattern] - elif op_name_pattern == op_name: - config_mapping[(op_name, op_type)] = op_name_config_dict[op_name_pattern] - return config_mapping - - @staticmethod - def _is_op_type(name: str) -> bool: - # * Ort and TF may override this method. 
- return not isinstance(name, str) - - @classmethod - @abstractmethod - def get_config_set_for_tuning(cls): - raise NotImplementedError - - -class ComposableConfig(BaseConfig): - name = constants.COMPOSABLE_CONFIG - - def __init__(self, configs: List[BaseConfig]) -> None: - self.config_list = configs - - def __add__(self, other: BaseConfig) -> BaseConfig: - if isinstance(other, type(self)): - self.config_list.extend(other.config_list) - else: - self.config_list.append(other) - return self - - def to_dict(self): - result = {} - for config in self.config_list: - result[config.name] = config.to_dict() - return result - - @classmethod - def from_dict(cls, config_dict: OrderedDict[str, Dict], config_registry: Dict[str, BaseConfig]): - assert len(config_dict) >= 1, "The config dict must include at least one configuration." - num_configs = len(config_dict) - name, value = next(iter(config_dict.items())) - config = config_registry[name].from_dict(value) - for _ in range(num_configs - 1): - name, value = next(iter(config_dict.items())) - config += config_registry[name].from_dict(value) - return config - - def to_json_string(self, use_diff: bool = False) -> str: - return json.dumps(self.to_dict(), indent=2) + "\n" - - def __repr__(self) -> str: - return f"{self.__class__.__name__} {self.to_json_string()}" - - def to_config_mapping( - self, config_list: List[BaseConfig] = None, model_info: Dict[str, Any] = None - ) -> OrderedDict[str, BaseConfig]: - config_mapping = OrderedDict() - for config in self.config_list: - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - single_config_model_info = model_info.get(config.name, None) - for op_name, op_type in single_config_model_info: - if op_type in op_type_config_dict: - config_mapping[(op_name, op_type)] = op_name_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if re.match(op_name_pattern, op_name): - config_mapping[(op_name, op_type)] = op_name_config_dict[op_name_pattern] - return config_mapping - - @classmethod - def register_supported_configs(cls): - """Add all supported configs.""" - raise NotImplementedError - - @classmethod - def get_config_set_for_tuning(cls) -> None: - # TODO (Yi) handle the composable config in `tuning_config` - return None - - def get_model_info(self, model, *args, **kwargs): - model_info_dict = dict() - for config in self.config_list: - model_info_dict.update({config.name: config.get_model_info(model, *args, **kwargs)}) - return model_info_dict - - -def get_all_config_set_from_config_registry() -> List[BaseConfig]: - all_registered_config_cls: List[Type[BaseConfig]] = config_registry.get_all_config_cls() - config_set = [] - for config_cls in all_registered_config_cls: - config_set.append(config_cls.get_config_set_for_tuning()) - return config_set - - -def register_supported_configs(): - """Register supported configs.""" - all_registered_config_cls: List[Type[BaseConfig]] = config_registry.get_all_config_cls() - for config_cls in all_registered_config_cls: - config_cls.register_supported_configs() - - -class _OperatorConfig(NamedTuple): - config: BaseConfig - operators: List[Union[str, Callable]] - valid_func_list: List[Callable] = [] - - -######################## RNT Config ############################### - - -@register_config(algo_name=constants.RTN, priority=constants.PRIORITY_RTN) -class RTNConfig(BaseConfig): - """Config class for round-to-nearest weight-only quantization.""" - - supported_configs: List[_OperatorConfig] = [] - params_list: List[Union[str, TuningParam]] = [ 
- "weight_dtype", - "weight_bits", - "weight_group_size", - "weight_sym", - "act_dtype", - "accuracy_level", - "ratios", - ] - model_params_list: List[str] = [ - "providers", - "layer_wise_quant", - ] - name: str = constants.RTN - - def __init__( - self, - weight_dtype: str = "int", - weight_bits: int = 4, - weight_group_size: int = 32, - weight_sym: bool = True, - act_dtype: str = "fp32", - accuracy_level: int = 0, - ratios: dict = {}, - providers: List[str] = ["CPUExecutionProvider"], - layer_wise_quant: bool = False, - quant_last_matmul: bool = True, - white_list: List[Union[str, Callable]] = constants.DEFAULT_WHITE_LIST, - ): - """Init RTN weight-only quantization config. - - Args: - weight_dtype (str, optional): Data type for weights, default is "int". - weight_bits (int, optional): Number of bits used to represent weights, default is 4. - weight_group_size (int, optional): Size of weight groups, default is 32. - weight_sym (bool, optional): Indicates whether weights are symmetric, default is True. - act_dtype (str, optional): Data type for activations, default is "fp32". - accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel). Defaults to 0. - ratios (dict, optional): percentile of clip. Defaults to {}. - providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. - layer_wise_quant (bool, optional): whether to quantize model layer by layer to save memory footprint. - Check below link for details - https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_layer_wise.md, - default is False. - quant_last_matmul (bool, optional): whether to quantize the last matmul of the model, default is True. - white_list (list, optional): op in white_list will be applied current config. - Defaults to constants.DEFAULT_WHITE_LIST. 
- """ - super().__init__(white_list=white_list) - self.weight_bits = weight_bits - self.weight_dtype = weight_dtype - self.weight_group_size = weight_group_size - self.weight_sym = weight_sym - self.act_dtype = act_dtype - self.accuracy_level = accuracy_level - self.ratios = ratios - self.providers = providers - self.layer_wise_quant = layer_wise_quant - self.quant_last_matmul = quant_last_matmul - self._post_init() - - def get_model_params_dict(self): - result = dict() - for param in self.model_params_list: - result[param] = getattr(self, param) - return result - - @classmethod - def register_supported_configs(cls) -> None: - supported_configs = [] - linear_rtn_config = RTNConfig( - weight_dtype=["int"], - weight_bits=[1, 2, 3, 4, 5, 6, 7, 8], - weight_group_size=[32, -1, 1, 16, 64, 128, 256, 512, 1024], - weight_sym=[True, False], - act_dtype=["fp32"], - ) - operators = ["MatMul"] - supported_configs.append(_OperatorConfig(config=linear_rtn_config, operators=operators)) - cls.supported_configs = supported_configs - - def to_config_mapping(self, config_list: List[BaseConfig] = None, model_info: list = None): - config_mapping = OrderedDict() - if config_list is None: - config_list = [self] - for config in config_list: - # update model level setting - config_mapping.update(config.get_model_params_dict()) - - # update node level setting - global_config = config.global_config - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - for op_name, op_type in model_info: - if self.global_config is not None: - config_mapping[(op_name, op_type)] = global_config - if op_type in op_type_config_dict: - config_mapping[(op_name, op_type)] = op_name_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if re.match(op_name_pattern, op_name): - config_mapping[(op_name, op_type)] = op_name_config_dict[op_name_pattern] - if not self.quant_last_matmul: - config_mapping[model_info[-1]] = { - "weight": {"dtype": "fp32"}, - "activation": {"dtype": "fp32", "quant_mode": "fp32"}, - } - return config_mapping - - @staticmethod - def get_model_info(model: Union[onnx.ModelProto, pathlib.Path, str]) -> list: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model, load_external_data=False) - white_list = ["MatMul"] - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - @classmethod - def get_config_set_for_tuning(cls) -> Union[None, "RTNConfig", List["RTNConfig"]]: # pragma: no cover - return RTNConfig(weight_bits=[4, 8], weight_sym=[True, False]) - - -def get_default_rtn_config() -> RTNConfig: - """Generate the default rtn config. - - Returns: - the default rtn config. 
- """ - return RTNConfig() - - -######################## GPTQ Config ############################### - - -@register_config(algo_name=constants.GPTQ, priority=constants.PRIORITY_GPTQ) -class GPTQConfig(BaseConfig): - """Config class for gptq weight-only quantization.""" - - supported_configs: List[_OperatorConfig] = [] - params_list: List[Union[str, TuningParam]] = [ - "weight_dtype", - "weight_bits", - "weight_group_size", - "weight_sym", - "act_dtype", - "accuracy_level", - ] - model_params_list: List[Union[str, TuningParam]] = [ - "percdamp", - "blocksize", - "actorder", - "mse", - "perchannel", - "providers", - "layer_wise_quant", - ] - name: str = constants.GPTQ - - def __init__( - self, - weight_dtype: str = "int", - weight_bits: int = 4, - weight_group_size: int = 32, - weight_sym: bool = True, - act_dtype: str = "fp32", - accuracy_level: int = 0, - percdamp: float = 0.01, - blocksize: int = 128, - actorder: bool = False, - mse: bool = False, - perchannel: bool = True, - providers: List[str] = ["CPUExecutionProvider"], - layer_wise_quant: bool = False, - quant_last_matmul: bool = True, - white_list: List[Union[str, Callable]] = constants.DEFAULT_WHITE_LIST, - ): - """Init GPTQ weight-only quantization config. - - Args: - weight_dtype (str, optional): data type for weights. Defaults to "int". - weight_bits (int, optional): number of bits used to represent weights. Defaults to 4. - weight_group_size (int, optional): size of weight groups. Defaults to 32. - weight_sym (bool, optional): indicates whether weights are symmetric. Defaults to True. - act_dtype (str, optional): data type for activations. Defaults to "fp32". - accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel). Defaults to 0. - percdamp (float, optional): percentage of Hessian's diagonal values' average, which will be added - to Hessian's diagonal to increase numerical stability. Defaults to 0.01. - blocksize (int, optional): execute GPTQ quantization per block. Defaults to 128. - actorder (bool, optional): whether to sort Hessian's diagonal values to rearrange channel-wise - quantization order. Defaults to False. - mse (bool, optional): whether get scale and zero point with mse error. Defaults to False. - perchannel (bool, optional): whether quantize weight per-channel. Defaults to True. - providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. - layer_wise_quant (bool, optional): whether to quantize model layer by layer to save memory footprint. - Check below link for details - https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_layer_wise.md, - default is False. - quant_last_matmul (bool, optional): whether to quantize the last matmul of the model, default is True. - white_list (list, optional): op in white_list will be applied current config. - Defaults to constants.DEFAULT_WHITE_LIST. 
- """ - super().__init__(white_list=white_list) - self.weight_bits = weight_bits - self.weight_dtype = weight_dtype - self.weight_group_size = weight_group_size - self.weight_sym = weight_sym - self.act_dtype = act_dtype - self.accuracy_level = accuracy_level - self.percdamp = percdamp - self.blocksize = blocksize - self.actorder = actorder - self.mse = mse - self.perchannel = perchannel - self.providers = providers - self.layer_wise_quant = layer_wise_quant - self.quant_last_matmul = quant_last_matmul - self._post_init() - - def get_model_params_dict(self): - result = dict() - for param in self.model_params_list: - result[param] = getattr(self, param) - return result - - @classmethod - def register_supported_configs(cls) -> None: - supported_configs = [] - linear_gptq_config = GPTQConfig( - weight_dtype=["int"], - weight_bits=[1, 2, 3, 4, 5, 6, 7, 8], - weight_group_size=[32, -1, 1, 16, 64, 128, 256, 512, 1024], - weight_sym=[True, False], - act_dtype=["fp32"], - actorder=[True, False], - mse=[True, False], - perchannel=[True, False], - ) - operators = ["MatMul"] - supported_configs.append(_OperatorConfig(config=linear_gptq_config, operators=operators)) - cls.supported_configs = supported_configs - - def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: - config_mapping = OrderedDict() - if config_list is None: - config_list = [self] - for config in config_list: - # update model level setting - config_mapping.update(config.get_model_params_dict()) - - # update node level setting - global_config = config.global_config - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - for op_name, op_type in model_info: - if self.global_config is not None: - config_mapping[(op_name, op_type)] = global_config - if op_type in op_type_config_dict: - config_mapping[(op_name, op_type)] = op_name_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if re.match(op_name_pattern, op_name): - config_mapping[(op_name, op_type)] = op_name_config_dict[op_name_pattern] - if not self.quant_last_matmul: - config_mapping[model_info[-1]] = { - "weight": {"dtype": "fp32"}, - "activation": {"dtype": "fp32", "quant_mode": "fp32"}, - } - return config_mapping - - @staticmethod - def get_model_info(model: Union[onnx.ModelProto, pathlib.Path, str]) -> list: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model, load_external_data=False) - white_list = ["MatMul"] - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - @classmethod - def get_config_set_for_tuning(cls) -> Union[None, "GPTQConfig", List["GPTQConfig"]]: # pragma: no cover - return GPTQConfig( - weight_bits=[4, 8], - weight_sym=[True, False], - actorder=[True, False], - mse=[True, False], - perchannel=[True, False], - ) - - -def get_default_gptq_config() -> GPTQConfig: - """Generate the default gptq config. - - Returns: - the default gptq config. 
- """ - return GPTQConfig() - - -######################## AWQ Config ############################### - - -@register_config(algo_name=constants.AWQ, priority=constants.PRIORITY_AWQ) -class AWQConfig(BaseConfig): - """Config class for awq weight-only quantization.""" - - supported_configs: List[_OperatorConfig] = [] - params_list: List[str] = [ - "weight_dtype", - "weight_bits", - "weight_group_size", - "weight_sym", - "act_dtype", - "accuracy_level", - ] - model_params_list: List[str] = [ - "enable_auto_scale", - "enable_mse_search", - "providers", - ] - name: str = constants.AWQ - - def __init__( - self, - weight_dtype: str = "int", - weight_bits: int = 4, - weight_group_size: int = 32, - weight_sym: bool = True, - act_dtype: str = "fp32", - accuracy_level: int = 0, - enable_auto_scale: bool = True, - enable_mse_search: bool = True, - providers: List[str] = ["CPUExecutionProvider"], - quant_last_matmul: bool = True, - white_list: List[Union[str, Callable]] = constants.DEFAULT_WHITE_LIST, - ): - """Init AWQ weight-only quantization config. - - Args: - weight_dtype (str, optional): data type for weights. Defaults to "int". - weight_bits (int, optional): number of bits used to represent weights. Defaults to 4. - weight_group_size (int, optional): size of weight groups. Defaults to 32. - weight_sym (bool, optional): indicates whether weights are symmetric. Defaults to True. - act_dtype (str, optional): data type for activations. Defaults to "fp32". - accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel). Defaults to 0. - enable_auto_scale (bool, optional): whether to search for best scales based on activation distribution. - Defaults to True. - enable_mse_search (bool, optional): whether to search for the best clip range from range - [0.91, 1.0, 0.01]. Defaults to True. - providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. - quant_last_matmul (bool, optional): whether to quantize the last matmul of the model, default is True. - white_list (list, optional): op in white_list will be applied current config. - Defaults to constants.DEFAULT_WHITE_LIST. 
- """ - super().__init__(white_list=white_list) - self.weight_bits = weight_bits - self.weight_dtype = weight_dtype - self.weight_group_size = weight_group_size - self.weight_sym = weight_sym - self.act_dtype = act_dtype - self.accuracy_level = accuracy_level - self.enable_auto_scale = enable_auto_scale - self.enable_mse_search = enable_mse_search - self.providers = providers - self.quant_last_matmul = quant_last_matmul - self._post_init() - - def get_model_params_dict(self): - result = dict() - for param in self.model_params_list: - result[param] = getattr(self, param) - return result - - @classmethod - def register_supported_configs(cls) -> List[_OperatorConfig]: - supported_configs = [] - linear_awq_config = AWQConfig( - weight_dtype=["int"], - weight_bits=[1, 2, 3, 4, 5, 6, 7, 8], - weight_group_size=[32, -1, 1, 16, 64, 128, 256, 512, 1024], - weight_sym=[True, False], - act_dtype=["fp32"], - enable_auto_scale=[True, False], - enable_mse_search=[True, False], - ) - operators = ["MatMul"] - supported_configs.append(_OperatorConfig(config=linear_awq_config, operators=operators)) - cls.supported_configs = supported_configs - - def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: - config_mapping = OrderedDict() - if config_list is None: - config_list = [self] - for config in config_list: - # update model level setting - config_mapping.update(config.get_model_params_dict()) - - # update node level setting - global_config = config.global_config - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - for op_name, op_type in model_info: - if self.global_config is not None: - config_mapping[(op_name, op_type)] = global_config - if op_type in op_type_config_dict: - config_mapping[(op_name, op_type)] = op_name_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if re.match(op_name_pattern, op_name): - config_mapping[(op_name, op_type)] = op_name_config_dict[op_name_pattern] - if not self.quant_last_matmul: - config_mapping[model_info[-1]] = { - "weight": {"dtype": "fp32"}, - "activation": {"dtype": "fp32", "quant_mode": "fp32"}, - } - return config_mapping - - @staticmethod - def get_model_info(model: Union[onnx.ModelProto, pathlib.Path, str]) -> list: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model, load_external_data=False) - white_list = ["MatMul"] - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - @classmethod - def get_config_set_for_tuning(cls) -> Union[None, "AWQConfig", List["AWQConfig"]]: # pragma: no cover - return AWQConfig( - weight_bits=[4, 8], - weight_sym=[True, False], - enable_auto_scale=[True, False], - enable_mse_search=[True, False], - ) - - -def get_default_awq_config() -> AWQConfig: - """Generate the default awq config. - - Returns: - the default awq config. 
- """ - return AWQConfig() - - -######################## SmoohQuant Config ############################### - - -@register_config(algo_name=constants.SMOOTH_QUANT, priority=constants.PRIORITY_SMOOTH_QUANT) -class SmoothQuantConfig(BaseConfig, quantization.StaticQuantConfig): - """Smooth quant quantization config.""" - - supported_configs: List[_OperatorConfig] = [] - params_list: List[str] = [ - # smooth parameters - "alpha", - "folding", - "auto_alpha_args", - "calib_iter", - "scales_per_op", - ] - name: str = constants.SMOOTH_QUANT - - def __init__( - self, - alpha: float = 0.5, - folding: bool = True, - op_types: List[str] = ["Gemm", "Conv", "MatMul", "FusedConv"], - calib_iter: int = 100, - scales_per_op: bool = True, - auto_alpha_args: dict = {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"}, - providers: List[str] = ["CPUExecutionProvider"], - white_list: List[Union[str, Callable]] = constants.DEFAULT_WHITE_LIST, - **kwargs, - ): - """Init smooth quant config. - - Args: - alpha (float, optional): alpha value to balance the quantization difficulty of activation and weight. - Defaults to 0.5. - folding (bool, optional): whether fold those foldable Mul which are inserted for smooth quant. - Defaults to True. - op_types (list, optional): the op type to be smooth quantized. - Defaults to ["Gemm", "Conv", "MatMul", "FusedConv"]. - calib_iter (int, optional): iteration num for calibration. Defaults to 100. - scales_per_op (bool, optional): True, each op will have an individual scale, mainlyfor accuracy. - False, ops with the same input will share a scale, mainly for performance. Defaults to True. - auto_alpha_args (dict, optional): settings for alpha tuning. - Defaults to {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"}. - providers (list, optional): providers used for inference. - Defaults to ["CPUExecutionProvider"]. - white_list (list, optional): op in white_list will be applied current config. - Defaults to constants.DEFAULT_WHITE_LIST. 
- kwargs (dict): kwargs in below link are supported except calibration_data_reader: - https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/quantize.py#L78 - """ - BaseConfig.__init__(self) - kwargs.update({"calibration_data_reader": None}) - quantization.StaticQuantConfig.__init__(self, **kwargs) - self.alpha = alpha - self.folding = folding - self.op_types = op_types - self.calib_iter = calib_iter - self.scales_per_op = scales_per_op - self.auto_alpha_args = auto_alpha_args - self.providers = providers - self.white_list = white_list - self.weight_type = self.weight_type.value if isinstance(self.weight_type, enum.Enum) else self.weight_type - self.activation_type = ( - self.activation_type.value if isinstance(self.activation_type, enum.Enum) else self.activation_type - ) - self.calibrate_method = ( - self.calibrate_method.value if isinstance(self.calibrate_method, enum.Enum) else self.calibrate_method - ) - self.quant_format = self.quant_format.value if isinstance(self.quant_format, enum.Enum) else self.quant_format - self._post_init() - - @classmethod - def register_supported_configs(cls) -> List[_OperatorConfig]: - supported_configs = [] - smooth_quant_config = SmoothQuantConfig() - operators = ["Gemm", "Conv", "MatMul", "FusedConv"] - supported_configs.append(_OperatorConfig(config=smooth_quant_config, operators=operators)) - cls.supported_configs = supported_configs - - @staticmethod - def get_model_info(model) -> list: - white_list = ["Gemm", "Conv", "MatMul", "FusedConv"] - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - @classmethod - def get_config_set_for_tuning( - cls, - ) -> Union[None, "SmoothQuantConfig", List["SmoothQuantConfig"]]: # pragma: no cover - return SmoothQuantConfig(alpha=np.arange(0.3, 0.7, 0.05)) - - def convert_to_ort_config(self): - self.activation_type = quantization.QuantType(self.activation_type) - self.weight_type = quantization.QuantType(self.weight_type) - self.weight_type = quantization.QuantType(self.weight_type) - self.calibrate_method = quantization.CalibrationMethod(self.calibrate_method) - self.quant_format = quantization.QuantFormat(self.quant_format) - - -def get_default_sq_config() -> SmoothQuantConfig: - """Generate the default smooth quant config. - - Returns: - the default smooth quant config. - """ - return SmoothQuantConfig() - - -######################## WOQ Tuning Config ############################### - - -def get_woq_tuning_config() -> list: - """Generate the config set for WOQ tuning. - - Returns: - the list of WOQ quant config. 
- """ - RTN_G32ASYM = RTNConfig(weight_sym=False) - GPTQ_G32ASYM = GPTQConfig(weight_sym=False) - GPTQ_G32ASYM_DISABLE_LAST_MATMUL = GPTQConfig(weight_sym=False, quant_last_matmul=False) - GPTQ_G128ASYM = GPTQConfig(weight_group_size=128, weight_sym=False) - AWQ_G32ASYM = AWQConfig(weight_sym=False) - return [RTN_G32ASYM, GPTQ_G32ASYM, GPTQ_G32ASYM_DISABLE_LAST_MATMUL, GPTQ_G128ASYM, AWQ_G32ASYM] - - -##################### INC Algo Configs End ################################### - -register_supported_configs() - -##################### Config for ONNXRuntime-like user-facing API ############ - - -class StaticQuantConfig(quantization.StaticQuantConfig): - - def __init__(self, calibration_data_reader: data_reader.CalibrationDataReader, extra_options=None, *args, **kwargs): - """This is a class for static Quant Configuration. - - Inherit from StaticQuantConfig: - https://github.com/microsoft/onnxruntime/blob/v1.17.1/onnxruntime/python/tools/quantization/quantize.py#L78 - extra_options: - Support smoothquant args. - - SmoothQuant = True/False : - Default is False. If enabled, SmoothQuant algorithm will be applied before quantization to do - fake input channel quantization. - - SmoothQuantAlpha = float : - Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight - and activation quantization. A larger alpha value could be used on models with more significant - activation outliers to migrate more quantization difficulty to weights. - - SmoothQuantFolding = True/False : - Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during - SmoothQuant will be folded into the previous op if the previous op is foldable. - - SmoothQuantOpTypes = list (new args): - Default is ["Gemm", "Conv", "MatMul", "FusedConv"]. It only works if SmoothQuant is True. - It controls the op types to be smooth quantized. - - SmoothQuantCalibIter = int (new args): - Default is 100. It only works if SmoothQuant is True. It controls the iteration num for calibration. - - SmoothQuantScalesPerOp = True/False (new args) : - Default is True. It only works if SmoothQuant is True. - If enabled, each op will have an individual scale, mainlyfor accuracy. - If not enabled, ops with the same input will share a scale, mainly for performance. - """ - super().__init__(calibration_data_reader=calibration_data_reader, extra_options=extra_options, *args, **kwargs) - - def to_dict(self): - return self.__dict__ - - -class DynamicQuantConfig(quantization.DynamicQuantConfig): - """This is a class for dynamic Quant Configuration. 
- - Inherit from DynamicQuantConfig: - https://github.com/microsoft/onnxruntime/blob/v1.17.1/onnxruntime/python/tools/quantization/quantize.py#L206 - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - -def generate_nc_sq_config(quant_config: quantization.StaticQuantConfig): - extra_options = quant_config.extra_options - quant_kwargs = { - "alpha": extra_options.get("SmoothQuantAlpha", 0.5), - "folding": extra_options.get("SmoothQuantFolding", True), - "op_types": extra_options.get("SmoothQuantOpTypes", ["Gemm", "Conv", "MatMul", "FusedConv"]), - "calib_iter": extra_options.get("SmoothQuantCalibIter", 100), - "scales_per_op": extra_options.get("SmoothQuantScalesPerOp", True), - } - quant_config.extra_options["SmoothQuant"] = False - quant_config_dict = quant_config.to_dict() - nc_sq_config = SmoothQuantConfig(**quant_kwargs, **quant_config_dict) - return nc_sq_config diff --git a/onnx_neural_compressor/constants.py b/onnx_neural_compressor/constants.py index d2e0391c6..71caf2a49 100644 --- a/onnx_neural_compressor/constants.py +++ b/onnx_neural_compressor/constants.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -# # Copyright (c) 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -30,6 +28,7 @@ COMPOSABLE_CONFIG = "composable_config" RTN = "rtn" STATIC_QUANT = "static_quant" +DYNAMIC_QUANT = "dynamic_quant" SMOOTH_QUANT = "smooth_quant" GPTQ = "gptq" AWQ = "awq" @@ -44,7 +43,283 @@ PRIORITY_GPTQ = 70 PRIORITY_AWQ = 50 PRIORITY_SMOOTH_QUANT = 80 +PRIORITY_STATIC_QUANT = 70 +PRIORITY_DYNAMIC_QUANT = 60 MAXIMUM_PROTOBUF = 2147483648 WHITE_MODULE_LIST = ["MatMul", "Conv"] + +RTN_OP_LIST = ["MatMul"] + +AWQ_OP_LIST = ["MatMul"] + +GPTQ_OP_LIST = ["MatMul"] + +DYNAMIC_CPU_OP_LIST = ["FusedConv", "Conv", "EmbedLayerNormalization", "MatMul", "Gather", "Attention", "LSTM"] +DYNAMIC_CUDA_OP_LIST = ["FusedConv", "Conv", "EmbedLayerNormalization", "MatMul", "Gather", "Attention", "LSTM"] +DYNAMIC_DML_OP_LIST = [] +DYNAMIC_DNNL_OP_LIST = ["FusedConv", "Conv", "EmbedLayerNormalization", "MatMul", "Gather", "Attention", "LSTM"] +DYNAMIC_TRT_OP_LIST = [] + +STATIC_QDQ_CPU_OP_LIST = [ + "FusedConv", + "Conv", + "Gather", + "GatherElements", + "GatherND", + "Tile", + "MatMul", + "Gemm", + "EmbedLayerNormalization", + "Attention", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "Resize", + "Abs", + "Shrink", + "Sign", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", +] +STATIC_QDQ_CUDA_OP_LIST = [ + "FusedConv", + "Conv", + "Gather", + "MatMul", + "Gemm", + "EmbedLayerNormalization", + "Attention", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "Resize", + "Abs", + "Shrink", + "Sign", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", +] +STATIC_QDQ_DML_OP_LIST = [ + "Conv", + "MatMul", + "Relu", + "Clip", + "MaxPool", +] +STATIC_QDQ_DNNL_OP_LIST = [ + "FusedConv", + "Conv", + "Gather", + "MatMul", + "Gemm", + "EmbedLayerNormalization", + "Attention", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "Resize", +] +STATIC_QDQ_TRT_OP_LIST = [ + 
"Conv", + "MatMul", + "Attention", + "LeakyRelu", + "Gather", + "Sigmoid", + "MaxPool", + "EmbedLayerNormalization", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "Resize", + "Gemm", + "Add", +] + +STATIC_QOPERATOR_CPU_OP_LIST = [ + "FusedConv", + "Conv", + "Gather", + "GatherElements", + "GatherND", + "Tile", + "MatMul", + "Gemm", + "EmbedLayerNormalization", + "Attention", + "Mul", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Add", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "ArgMax", + "Resize", + "Abs", + "Shrink", + "Sign", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", +] +STATIC_QOPERATOR_CUDA_OP_LIST = [ + "FusedConv", + "Conv", + "Gather", + "MatMul", + "Gemm", + "EmbedLayerNormalization", + "Attention", + "Mul", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Add", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "ArgMax", + "Resize", + "Abs", + "Shrink", + "Sign", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", +] +STATIC_QOPERATOR_DML_OP_LIST = [ + "Conv", + "MatMul", + "Mul", + "Relu", + "Clip", + "MaxPool", + "Add", +] +STATIC_QOPERATOR_DNNL_OP_LIST = [ + "FusedConv", + "Conv", + "Gather", + "MatMul", + "Gemm", + "EmbedLayerNormalization", + "Attention", + "Mul", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Add", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "ArgMax", + "Resize", +] +STATIC_QOPERATOR_TRT_OP_LIST = [] + +STATIC_QOPERATOR_OP_LIST_MAP = { + "CPUExecutionProvider": STATIC_QOPERATOR_CPU_OP_LIST, + "CUDAExecutionProvider": STATIC_QOPERATOR_CUDA_OP_LIST, + "DmlExecutionProvider": STATIC_QOPERATOR_DML_OP_LIST, + "DnnlExecutionProvider": STATIC_QOPERATOR_DNNL_OP_LIST, + "TensorrtExecutionProvider": STATIC_QOPERATOR_TRT_OP_LIST, +} + +STATIC_QDQ_OP_LIST_MAP = { + "CPUExecutionProvider": STATIC_QDQ_CPU_OP_LIST, + "CUDAExecutionProvider": STATIC_QDQ_CUDA_OP_LIST, + "DmlExecutionProvider": STATIC_QDQ_DML_OP_LIST, + "DnnlExecutionProvider": STATIC_QDQ_DNNL_OP_LIST, + "TensorrtExecutionProvider": STATIC_QDQ_TRT_OP_LIST, +} + +DYNAMIC_OP_LIST_MAP = { + "CPUExecutionProvider": DYNAMIC_CPU_OP_LIST, + "CUDAExecutionProvider": DYNAMIC_CUDA_OP_LIST, + "DmlExecutionProvider": DYNAMIC_DML_OP_LIST, + "DnnlExecutionProvider": DYNAMIC_DNNL_OP_LIST, + "TensorrtExecutionProvider": DYNAMIC_TRT_OP_LIST, +} diff --git a/onnx_neural_compressor/data_reader.py b/onnx_neural_compressor/data_reader.py index 24538ce55..7f76769f0 100644 --- a/onnx_neural_compressor/data_reader.py +++ b/onnx_neural_compressor/data_reader.py @@ -1,7 +1,4 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation +# Copyright (c) 2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,15 +14,25 @@ import abc -from onnxruntime import quantization +class CalibrationDataReader(metaclass=abc.ABCMeta): + @classmethod + def __subclasshook__(cls, subclass): + return hasattr(subclass, "get_next") and callable(subclass.get_next) or NotImplemented + + @abc.abstractmethod + def get_next(self) -> dict: + """Generate the input data dict for an ONNX InferenceSession run.""" + raise NotImplementedError -class CalibrationDataReader(quantization.CalibrationDataReader): - """Get data for calibration. + def __iter__(self): + return self - We define our CalibrationDataReader based on the class in below link: - https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/calibrate.py#L139 - """ + def __next__(self): + result = self.get_next() + if result is None: + raise StopIteration + return result @abc.abstractmethod def rewind(self): diff --git a/onnx_neural_compressor/onnx_model.py b/onnx_neural_compressor/onnx_model.py index 061f7cad8..c1661f85e 100644 --- a/onnx_neural_compressor/onnx_model.py +++ b/onnx_neural_compressor/onnx_model.py @@ -21,12 +21,11 @@ import onnx import transformers -from onnxruntime.quantization import onnx_model from onnx_neural_compressor import constants, logger, utility -class ONNXModel(onnx_model.ONNXModel): +class ONNXModel: """Build ONNX model.""" def __init__(self, model, **kwargs): @@ -36,27 +35,69 @@ def __init__(self, model, **kwargs): model (str or ModelProto): path to onnx model or loaded ModelProto model object. """ self.model = model if not isinstance(model, str) else onnx.load(model, load_external_data=False) - super().__init__(self.model) - self._model_path = None if not isinstance(model, str) else model self.check_is_large_model() if self._is_large_model and self._model_path is None and not kwargs.get("ignore_warning", False): logger.warning("Model size > 2GB. 
Please use model path instead of onnx model object to quantize") if self._is_large_model and isinstance(model, str) and kwargs.get("load_external_data", True): - onnx.external_data_helper.load_external_data_for_model(self.model, os.path.dirname(self._model_path)) self._config = None if isinstance(model, str) and os.path.exists(pathlib.Path(model).parent.joinpath("config.json").as_posix()): self._config = transformers.PretrainedConfig.from_pretrained(pathlib.Path(model).parent.as_posix()) self.node_name_counter = {} - self._output_name_to_node = self.output_name_to_node() - self._input_name_to_nodes = self.input_name_to_nodes() + self._output_name_to_node = {} + self._input_name_to_nodes = {} + self._get_output_name_to_node(self.model.graph.node) + self._get_input_name_to_nodes(self.model.graph.node) self._graph_info = {} self._get_graph_info() self._q_config = None + def output_name_to_node(self): + self._output_name_to_node = {} + self._get_output_name_to_node(self.model.graph.node) + return self._output_name_to_node + + def input_name_to_nodes(self): + self._input_name_to_nodes = {} + self._get_input_name_to_nodes(self.model.graph.node) + return self._input_name_to_nodes + + def _get_input_name_to_nodes(self, nodes): + """Get input names of nodes.""" + for node in nodes: + attrs = [ + attr + for attr in node.attribute + if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS + ] + if len(attrs) > 0: + for attr in attrs: + self._get_input_name_to_nodes(attr.g.node) + for input_name in node.input: + if len(input_name.strip()) != 0: + if input_name not in self._input_name_to_nodes: + self._input_name_to_nodes[input_name] = [node] + else: + self._input_name_to_nodes[input_name].append(node) + + def _get_output_name_to_node(self, nodes): + """Get output names of nodes.""" + for node in nodes: + attrs = [ + attr + for attr in node.attribute + if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS + ] + if len(attrs) > 0: + for attr in attrs: + self._get_output_name_to_node(attr.g.node) + for output_name in node.output: + if len(output_name.strip()) != 0: + self._output_name_to_node[output_name] = node + @property def model_path(self): """Return model path.""" @@ -99,6 +140,11 @@ def framework(self): """Return framework.""" return "onnxruntime" + def add_initializer(self, tensor): + """Add a initializer to model.""" + if tensor.name not in [i.name for i in self._model.graph.initializer]: + self._model.graph.initializer.append(tensor) + def add_initializers(self, tensors): """Add initializers to model.""" for tensor in tensors: @@ -127,6 +173,42 @@ def output(self): """Return output of model.""" return [i.name for i in self.model.graph.output] + @property + def model(self): + """Return model itself.""" + return self._model + + @model.setter + def model(self, model): + """Set model itself.""" + self._model = model + self._graph_info = {} + self._get_graph_info() + self._output_name_to_node = {} + self._input_name_to_nodes = {} + self._get_input_name_to_nodes(self._model.graph.node) + self._get_output_name_to_node(self._model.graph.node) + + def nodes(self): + """Return model nodes.""" + return self._model.graph.node + + def initializer(self): + """Return model initializer.""" + return self._model.graph.initializer + + def graph(self): + """Return model graph.""" + return self._model.graph + + def ir_version(self): + """Return model ir_version.""" + return self._model.ir_version + + def opset_import(self): + """Return model 
opset_import.""" + return self._model.opset_import + def update(self): """Update model info.""" self._graph_info = {} @@ -144,6 +226,10 @@ def _get_graph_info(self): for node in self.model.graph.node: self.graph_info.update({node.name: node.op_type}) + def is_graph_output(self, name): + """Check whether the tensor is the graph output.""" + return name in self.output() + def save(self, root): """Save ONNX model.""" if os.path.split(root)[0] != "" and not os.path.exists(os.path.split(root)[0]): @@ -168,6 +254,53 @@ def save(self, root): output_config_file = pathlib.Path(root).parent.joinpath("config.json").as_posix() self._config.to_json_file(output_config_file, use_diff=False) + def remove_initializer(self, tensor): + """Remove an initializer from model.""" + if tensor in self._model.graph.initializer: + self._model.graph.initializer.remove(tensor) + + def remove_initializers(self, init_to_remove): + """Remove initializers from model.""" + for initializer in init_to_remove: + self.remove_initializer(initializer) + + def get_initializer(self, name): + """ "Find the initializer with specified name.""" + for initializer in self.model.graph.initializer: + if initializer.name == name: + return initializer + return None + + def remove_node(self, node): + """Remove a node from model.""" + if node in self._model.graph.node: + self._model.graph.node.remove(node) + + def remove_nodes(self, nodes_to_remove): + """Remove nodes from model.""" + for node in nodes_to_remove: + self.remove_node(node) + + def add_node(self, node): + """Add a node to model.""" + self._model.graph.node.extend([node]) + + def add_nodes(self, nodes_to_add): + """Add nodes to model.""" + self._model.graph.node.extend(nodes_to_add) + + def get_children(self, node, input_name_to_nodes=None): + """Get children nodes.""" + if input_name_to_nodes is None: + input_name_to_nodes = self._input_name_to_nodes + + children = [] + for output in node.output: + if output in input_name_to_nodes: + for child in input_name_to_nodes[output]: + children.append(child) + return children + def get_initializer_share_num(self, name): """Get the number of shares of initializer.""" num = 0 @@ -186,6 +319,25 @@ def get_node(self, name): return node return None + def get_parent(self, node, idx, output_name_to_node=None): + if output_name_to_node is None: + output_name_to_node = self._output_name_to_node + if len(node.input) <= idx: + return None + + input = node.input[idx] + return output_name_to_node.get(input, None) + + def get_parents(self, node, output_name_to_node=None): + if output_name_to_node is None: + output_name_to_node = self._output_name_to_node + + parents = [] + for input in node.input: + if input in output_name_to_node: + parents.append(output_name_to_node[input]) + return parents + def get_node_by_weight(self, weight_name): """Get a node by its weight name.""" if len(self._input_name_to_nodes) == 0: @@ -277,6 +429,22 @@ def _searcher(tensor_name): assert zo_tensor, "missing zero point for tensor {}".format(tensor) return scale_tensor, zo_tensor + @staticmethod + def replace_node_input(node, old_input_name, new_input_name): + """Replace input of a node.""" + assert isinstance(old_input_name, str) and isinstance(new_input_name, str) + for j in range(len(node.input)): + if node.input[j] == old_input_name: + node.input[j] = new_input_name + + @staticmethod + def replace_node_output(node, old_output_name, new_output_name): + """Replace output of a node.""" + assert isinstance(old_output_name, str) and isinstance(new_output_name, str) + for j in 
range(len(node.output)): + if node.output[j] == old_output_name: + node.output[j] = new_output_name + def replace_input_of_all_nodes(self, old_input_name, new_input_name, white_optype=[], black_optype=[]): """Replace inputs of all nodes.""" if len(white_optype) > 0: @@ -299,10 +467,21 @@ def replace_output_of_all_nodes(self, old_output_name, new_output_name, white_op if node.op_type not in black_optype: ONNXModel.replace_node_output(node, old_output_name, new_output_name) + def remove_duplicate_nodes(self): + """remove duplicate nodes""" + new_nodes = [] + for node in self.nodes(): + if node not in new_nodes: + new_nodes.append(node) + self.model.graph.ClearField("node") + self.model.graph.node.extend(new_nodes) + self.update() + def remove_unused_nodes(self): """Remove unused nodes.""" unused_nodes = [] nodes = self.nodes() + if len(self._input_name_to_nodes) == 0: self._input_name_to_nodes = self.input_name_to_nodes() if len(self._output_name_to_node) == 0: @@ -314,35 +493,26 @@ def remove_unused_nodes(self): and node.output[0] not in self._input_name_to_nodes ): unused_nodes.append(node) - elif ( - node.op_type == "QuantizeLinear" - and len(self.get_children(node)) == 1 - and self.get_children(node)[0].op_type == "DequantizeLinear" - and node.input[0] not in self._output_name_to_node - and self.get_children(node)[0].output[0] not in self._input_name_to_nodes - ): - unused_nodes.append(node) - unused_nodes.extend(self.get_children(node)) - else: - # remove the node if it does not serve as the input or output of any other nodes - unused = True - for output in node.output: - if output in self._input_name_to_nodes or output in self.output(): - unused = False - break - for input in node.input: - if self.get_initializer(input) is not None: - continue - elif input in self._output_name_to_node or input in self.input(): - unused = False - break - if unused: - unused_nodes.append(node) + self.remove_nodes(unused_nodes) + unvalid_nodes = [ + i + for i in self.model.graph.node + if all(out not in self._input_name_to_nodes and out not in self.output() for out in i.output) + ] + while len(unvalid_nodes) > 0: + self.remove_nodes(unvalid_nodes) + self._input_name_to_nodes = self.input_name_to_nodes() + unvalid_nodes = [ + i + for i in self.model.graph.node + if all([out not in self._input_name_to_nodes and out not in self.output() for out in i.output]) + ] + ununsed_weights = [] for w in self.model.graph.initializer: - if w.name not in self._input_name_to_nodes and w.name not in self.model.graph.output: + if w.name not in self._input_name_to_nodes and w.name not in self.output(): ununsed_weights.append(w) # Remove from graph.input for graph_input in self.graph().input: @@ -351,6 +521,7 @@ def remove_unused_nodes(self): self.remove_initializers(ununsed_weights) self.update() + self.topological_sort() def topological_sort(self, enable_subgraph=False): """Topological sort the model.""" @@ -403,43 +574,6 @@ def topological_sort(self, enable_subgraph=False): self.model.graph.ClearField("node") self.model.graph.node.extend(nodes) - def get_nodes_chain(self, start, stop, result_chain=[]): - """Get nodes chain with given start node and stop node.""" - # process start node list - start_node = collections.deque() - for node in start: - if isinstance(node, str): - start_node.append(node) - elif isinstance(node, onnx.NodeProto): - start_node.append(node.name) - else: - assert False, "'get_nodes_chain' function only support list[string]" "or list[NodeProto] params" - - # process stop node list - stop_node = [] - 
for node in stop: - if isinstance(node, str): - stop_node.append(node) - elif isinstance(node, onnx.NodeProto): - stop_node.append(node.name) - else: - assert False, "'get_nodes_chain' function only support list[string]" "or list[NodeProto] params" - - while start_node: - node_name = start_node.popleft() - if node_name in stop_node: - continue - if node_name not in result_chain: - result_chain.append(node_name) - else: - continue - - node = utility.find_by_name(node_name, list(self.model.graph.node)) - for parent in self.get_parents(node): - start_node.append(parent.name) - - return result_chain - def find_split_node_for_layer_wise_quantization(self): """Find split node for layer wise quantization.""" # find split nodes of decoder blocks @@ -800,22 +934,7 @@ def split_model_with_node(self, split_node_name, path_of_model_to_split, save_bo # origin model : ... -> node_1 -> split_node -> node_2 -> ... # split model 1: ... -> node_1 -> split_node # split model 2: node_2 -> ... - - # remove nodes which are not followed by other nodes - unvalid_nodes = [ - i - for i in self.model.graph.node - if all(out not in self._input_name_to_nodes and not self.is_graph_output(out) for out in i.output) - ] - while len(unvalid_nodes) > 0: - self.remove_nodes(unvalid_nodes) - self._input_name_to_nodes = self.input_name_to_nodes() - unvalid_nodes = [ - i - for i in self.model.graph.node - if all([out not in self._input_name_to_nodes and not self.is_graph_output(out) for out in i.output]) - ] - self.topological_sort() + self.remove_unused_nodes() split_model_part_1 = onnx.ModelProto() split_model_part_1.CopyFrom(self.model) diff --git a/onnx_neural_compressor/quantization/__init__.py b/onnx_neural_compressor/quantization/__init__.py index 7ef91659a..67e82f0fc 100644 --- a/onnx_neural_compressor/quantization/__init__.py +++ b/onnx_neural_compressor/quantization/__init__.py @@ -12,7 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. 
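# A minimal usage sketch of the reworked ONNXModel wrapper above, assuming only the
# methods visible in this diff; the file and initializer names here are hypothetical.
from onnx_neural_compressor.onnx_model import ONNXModel

# Loading by path keeps external data resolvable for models larger than 2GB.
model = ONNXModel("model.onnx")  # hypothetical path

# Graph bookkeeping is now maintained locally instead of being inherited from
# onnxruntime.quantization.onnx_model.ONNXModel.
producers = model.output_name_to_node()   # tensor name -> producing node
consumers = model.input_name_to_nodes()   # tensor name -> consuming nodes
weight = model.get_initializer("fc.weight")  # hypothetical initializer name; None if absent

# remove_unused_nodes() now also drops dangling chains and ends with a topological sort.
model.remove_unused_nodes()
model.save("model_cleaned.onnx")  # hypothetical output path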
- -from onnxruntime.quantization.quant_utils import QuantFormat, QuantType - +from onnx_neural_compressor.quantization.quant_utils import CalibrationMethod, QuantFormat, QuantType from onnx_neural_compressor.quantization.quantize import quantize diff --git a/onnx_neural_compressor/quantization/algorithm_entry.py b/onnx_neural_compressor/quantization/algorithm_entry.py index cd079932c..12689fa7e 100644 --- a/onnx_neural_compressor/quantization/algorithm_entry.py +++ b/onnx_neural_compressor/quantization/algorithm_entry.py @@ -17,11 +17,148 @@ from typing import Union import onnx -from onnxruntime import quantization +import onnxruntime as ort -from onnx_neural_compressor import config, constants, data_reader, logger, utility +from onnx_neural_compressor import constants, data_reader, logger, utility +from onnx_neural_compressor.algorithms.post_training_quant import calibrate, quantizer from onnx_neural_compressor.algorithms.smoother import core from onnx_neural_compressor.algorithms.weight_only import awq, gptq, rtn +from onnx_neural_compressor.quantization import config + + +###################### RTN Algo Entry ################################## +@utility.register_algo(name=constants.RTN) +def rtn_quantize_entry( + model: Union[pathlib.Path, str], quant_config: config.RTNConfig, *args, **kwargs +) -> onnx.ModelProto: + """The main entry to apply rtn quantization.""" + if len(quant_config.config_mapping) == 0: + # map config to each op + model_info = config.RTNConfig.get_model_info(model=model) + config_mapping = quant_config.to_config_mapping(model_info=model_info) + logger.debug(config_mapping) + else: + config_mapping = quant_config.config_mapping + quant_kwargs = {} + quant_kwargs = {key: getattr(quant_config, key) for key in config.RTNConfig.model_params_list} + model = rtn.apply_rtn_on_model(model, config_mapping, **quant_kwargs) + return model + + +###################### GPTQ Algo Entry ################################## +@utility.register_algo(name=constants.GPTQ) +def gptq_quantize_entry( + model: Union[pathlib.Path, str], + quant_config: config.GPTQConfig, + calibration_data_reader: data_reader.CalibrationDataReader, + *args, + **kwargs, +) -> onnx.ModelProto: + """The main entry to apply gptq quantization.""" + assert calibration_data_reader is not None, "Please provide calibration_data_reader" + assert isinstance( + calibration_data_reader, data_reader.CalibrationDataReader + ), "Please follow onnx_neural_compressor/data_reader.py to implement calibration_data_reader" + + if len(quant_config.config_mapping) == 0: + # map config to each op + model_info = config.GPTQConfig.get_model_info(model=model) + config_mapping = quant_config.to_config_mapping(model_info=model_info) + logger.debug(config_mapping) + else: + config_mapping = quant_config.config_mapping + quant_kwargs = {} + quant_kwargs = {key: getattr(quant_config, key) for key in config.GPTQConfig.model_params_list} + + # regenerate to ensure data exists + calibration_data_reader.rewind() + model = gptq.apply_gptq_on_model(model, config_mapping, calibration_data_reader, **quant_kwargs) + return model + + +###################### AWQ Algo Entry ################################## +@utility.register_algo(name=constants.AWQ) +def awq_quantize_entry( + model: Union[pathlib.Path, str], + quant_config: config.AWQConfig, + calibration_data_reader: data_reader.CalibrationDataReader, + *args, + **kwargs, +) -> onnx.ModelProto: + """The main entry to apply awq quantization.""" + assert calibration_data_reader is not None, "Please 
provide calibration_data_reader" + assert isinstance( + calibration_data_reader, data_reader.CalibrationDataReader + ), "Please follow onnx_neural_compressor/data_reader.py to implement calibration_data_reader" + + if len(quant_config.config_mapping) == 0: + # map config to each op + model_info = config.AWQConfig.get_model_info(model=model) + config_mapping = quant_config.to_config_mapping(model_info=model_info) + logger.debug(config_mapping) + else: + config_mapping = quant_config.config_mapping + quant_kwargs = {} + quant_kwargs = {key: getattr(quant_config, key) for key in config.AWQConfig.model_params_list} + + # regenerate to ensure data exists + calibration_data_reader.rewind() + model = awq.apply_awq_on_model(model, config_mapping, calibration_data_reader, **quant_kwargs) + return model + + +###################### Static quant Entry ################################## +@utility.register_algo(name=constants.STATIC_QUANT) +def static_quantize_entry( + model: Union[pathlib.Path, str], + quant_config: config.StaticQuantConfig, + calibration_data_reader: data_reader.CalibrationDataReader, + model_output: Union[pathlib.Path, str] = None, + *args, + **kwargs, +) -> onnx.ModelProto: + """The main entry to apply static quantization.""" + if len(quant_config.op_types_to_quantize) == 0: + logger.warning("No candidate op type to do quantization, exit.") + exit(0) + assert calibration_data_reader is not None, "Please provide calibration_data_reader" + assert isinstance( + calibration_data_reader, data_reader.CalibrationDataReader + ), "Please follow onnx_neural_compressor/data_reader.py to implement calibration_data_reader" + + if len(quant_config.config_mapping) == 0: + # map config to each op + model_info = config.StaticQuantConfig.get_model_info(model=model) + config_mapping = quant_config.to_config_mapping(model_info=model_info) + logger.debug(config_mapping) + else: + config_mapping = quant_config.config_mapping + + calibration_data_reader.rewind() + augment = calibrate.ONNXRTAugment( + model, + calibration_data_reader, + dump_op_types=quant_config.op_types_to_quantize, + execution_provider=quant_config.execution_provider, + iterations=list(range(0, quant_config.calibration_sampling_size)), + ) + min_max = augment.dump_minmax(config_mapping) + quantize_params = augment.dump_calibration(config_mapping, min_max=min_max) + _quantizer = quantizer.StaticQuantizer( + model, + config_mapping, + quant_format=quant_config.quant_format.name.lower(), + quantization_params=quantize_params, + op_types_to_quantize=quant_config.op_types_to_quantize, + execution_provider=quant_config.execution_provider, + optypes_to_exclude_output_quant=quant_config.optypes_to_exclude_output_quant, + dedicated_qdq_pair=quant_config.dedicated_qdq_pair, + add_qdq_pair_to_weight=quant_config.add_qdq_pair_to_weight, + ) + _quantizer.quantize_model() + if model_output is not None: + _quantizer.model.save(model_output) + return _quantizer.model.model ###################### SmoothQuant Entry ################################## @@ -32,7 +169,7 @@ def smooth_quant_entry( calibration_data_reader: data_reader.CalibrationDataReader, model_output: Union[pathlib.Path, str] = None, *args, - **kwargs + **kwargs, ) -> Union[pathlib.Path, str, onnx.ModelProto]: """Apply smooth quant.""" assert calibration_data_reader is not None, "Please provide calibration_data_reader" @@ -45,7 +182,7 @@ def smooth_quant_entry( smoother = core.Smoother( model, calibration_data_reader, - providers=quant_config.providers, + 
execution_provider=getattr(quant_config, "execution_provider", "CPUExecutionProvider"), ) smoothed_model = smoother.transform(**quant_config.to_dict()) with tempfile.TemporaryDirectory(prefix="ort.quant.") as tmp_dir: @@ -65,80 +202,45 @@ def smooth_quant_entry( # exclude Mul operations which are inserted during smooth operation excluded_nodes = [i.name for i in smoothed_model.graph.node if i.name.endswith("_smooth_mul")] - quant_config.calibration_data_reader = calibration_data_reader quant_config.nodes_to_exclude.extend(excluded_nodes) - quant_config.convert_to_ort_config() - quantization.quantize( + + q_model = static_quantize_entry( pathlib.Path(tmp_dir).joinpath("smooth.onnx").as_posix(), - model_output or pathlib.Path(tmp_dir).joinpath("quant_model.onnx").as_posix(), quant_config, + calibration_data_reader, + model_output, ) - model = model_output or onnx.load(pathlib.Path(tmp_dir).joinpath("quant_model.onnx").as_posix()) - - return model - - -###################### RTN Algo Entry ################################## -@utility.register_algo(name=constants.RTN) -def rtn_quantize_entry( - model: Union[pathlib.Path, str], quant_config: config.RTNConfig, *args, **kwargs -) -> onnx.ModelProto: - """The main entry to apply rtn quantization.""" - # map config to each op - model_info = quant_config.get_model_info(model=model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.debug(configs_mapping) - model = rtn.apply_rtn_on_model(model, configs_mapping) - return model - - -###################### GPTQ Algo Entry ################################## -@utility.register_algo(name=constants.GPTQ) -def gptq_quantize_entry( - model: Union[pathlib.Path, str], - quant_config: config.GPTQConfig, - calibration_data_reader: data_reader.CalibrationDataReader, - *args, - **kwargs -) -> onnx.ModelProto: - """The main entry to apply gptq quantization.""" - assert calibration_data_reader is not None, "Please provide calibration_data_reader" - assert isinstance( - calibration_data_reader, data_reader.CalibrationDataReader - ), "Please follow onnx_neural_compressor/data_reader.py to implement calibration_data_reader" + return q_model - # map config to each op - model_info = quant_config.get_model_info(model=model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.debug(configs_mapping) - # regenerate to ensure data exists - calibration_data_reader.rewind() - model = gptq.apply_gptq_on_model(model, configs_mapping, calibration_data_reader) - return model - - -###################### AWQ Algo Entry ################################## -@utility.register_algo(name=constants.AWQ) -def awq_quantize_entry( +###################### Dynamic quant Entry ################################## +@utility.register_algo(name=constants.DYNAMIC_QUANT) +def dynamic_quantize_entry( model: Union[pathlib.Path, str], - quant_config: config.AWQConfig, - calibration_data_reader: data_reader.CalibrationDataReader, + quant_config: config.DynamicQuantConfig, + model_output: Union[pathlib.Path, str] = None, *args, - **kwargs + **kwargs, ) -> onnx.ModelProto: - """The main entry to apply awq quantization.""" - assert calibration_data_reader is not None, "Please provide calibration_data_reader" - assert isinstance( - calibration_data_reader, data_reader.CalibrationDataReader - ), "Please follow onnx_neural_compressor/data_reader.py to implement calibration_data_reader" - - # map config to each op - model_info = quant_config.get_model_info(model=model) - configs_mapping = 
quant_config.to_config_mapping(model_info=model_info) - logger.debug(configs_mapping) - - # regenerate to ensure data exists - calibration_data_reader.rewind() - model = awq.apply_awq_on_model(model, configs_mapping, calibration_data_reader) - return model + """The main entry to apply dynamic quantization.""" + if len(quant_config.op_types_to_quantize) == 0: + logger.warning("No candidate op type to do quantization, exit.") + exit(0) + + if len(quant_config.config_mapping) == 0: + # map config to each op + model_info = config.DynamicQuantConfig.get_model_info(model=model) + config_mapping = quant_config.to_config_mapping(model_info=model_info) + logger.debug(config_mapping) + else: + config_mapping = quant_config.config_mapping + + _quantizer = quantizer.DynamicQuantizer( + model, + config_mapping, + op_types_to_quantize=quant_config.op_types_to_quantize, + ) + _quantizer.quantize_model() + if model_output is not None: + _quantizer.model.save(model_output) + return _quantizer.model.model diff --git a/onnx_neural_compressor/quantization/calibrate.py b/onnx_neural_compressor/quantization/calibrate.py deleted file mode 100644 index 37bf7d671..000000000 --- a/onnx_neural_compressor/quantization/calibrate.py +++ /dev/null @@ -1,32 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import abc - -from onnxruntime import quantization - - -class CalibrationDataReader(quantization.CalibrationDataReader): - """Get data for calibration. - - We define our CalibrationDataReader based on the class in below link: - https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/calibrate.py#L139 - """ - - @abc.abstractmethod - def rewind(self): - """Regenerate data.""" - raise NotImplementedError diff --git a/onnx_neural_compressor/quantization/config.py b/onnx_neural_compressor/quantization/config.py new file mode 100644 index 000000000..5b8dcc178 --- /dev/null +++ b/onnx_neural_compressor/quantization/config.py @@ -0,0 +1,2249 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
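# The entries above assert on data_reader.CalibrationDataReader. A minimal sketch of a
# reader written against the get_next/rewind contract introduced earlier in this patch;
# the input name and shape below are hypothetical.
import numpy as np

from onnx_neural_compressor import data_reader


class RandomDataReader(data_reader.CalibrationDataReader):
    """Yields a fixed number of random samples; rewind() restarts iteration."""

    def __init__(self, num_samples=10):
        self._num_samples = num_samples
        self._index = 0

    def get_next(self):
        # Returning None signals exhaustion, which __next__ turns into StopIteration.
        if self._index >= self._num_samples:
            return None
        self._index += 1
        return {"input": np.random.rand(1, 3, 224, 224).astype(np.float32)}  # hypothetical input name/shape

    def rewind(self):
        self._index = 0


# Entries such as static_quantize_entry and gptq_quantize_entry take an instance of such a
# reader as calibration_data_reader and call rewind() before consuming the data.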
+ +from __future__ import annotations + +import copy +import dataclasses +import enum +import inspect +import itertools +import json +import os +import pathlib +import re +from abc import ABC, abstractmethod + +import numpy as np +import onnx +import pydantic +from onnxruntime import quantization as ort_quant +from typing_extensions import Self + +from onnx_neural_compressor import constants, data_reader, logger, quantization, utility + +from collections import OrderedDict # isort: skip +from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Type, Union, _GenericAlias # isort: skip + + +class ParamLevel(enum.Enum): + OP_LEVEL = enum.auto() + OP_TYPE_LEVEL = enum.auto() + MODEL_LEVEL = enum.auto() + + +class TuningParam: + """Define the tunable parameter for the algorithm. + + Example: + Class FakeAlgoConfig(config.BaseConfig): + '''Fake algo config.'''. + + params_list = [ + ... + # For simple tunable types, like a list of int, giving + # the param name is enough. `config.BaseConfig` class will + # create the `TuningParam` implicitly. + "simple_attr" + + # For complex tunable types, like a list of lists, + # developers need to create the `TuningParam` explicitly. + TuningParam("complex_attr", tunable_type=List[List[str]]) + + # The default parameter level is `ParamLevel.OP_LEVEL`. + # If the parameter is at a different level, developers need + # to specify it explicitly. + TuningParam("model_attr", level=ParamLevel.MODEL_LEVEL) + + ... + + # TODO: more examples to explain the usage of `TuningParam`. + """ + + def __init__( + self, + name: str, + default_val: Any = None, + tunable_type=None, + options=None, + level: ParamLevel = ParamLevel.OP_LEVEL, + ) -> None: + self.name = name + self.default_val = default_val + self.tunable_type = tunable_type + self.options = options + self.level = level + + @staticmethod + def create_input_args_model(expect_args_type: Any) -> type: + """Dynamically create an InputArgsModel based on the provided type hint. + + Parameters: + - expect_args_type (Any): The user-provided type hint for input_args. + + Returns: + - type: The dynamically created InputArgsModel class. + """ + + class DynamicInputArgsModel(pydantic.BaseModel): + input_args: expect_args_type + + return DynamicInputArgsModel + + def is_tunable(self, value: Any) -> bool: + # Use `Pydantic` to validate the input_args. + # TODO: refine the implementation in further. + assert isinstance(self.tunable_type, _GenericAlias), f"Expected a type hint, got {self.tunable_type} instead." + DynamicInputArgsModel = TuningParam.create_input_args_model(self.tunable_type) + try: + new_args = DynamicInputArgsModel(input_args=value) + return True + except Exception as e: + logger.debug(f"Failed to validate the input_args: {e}") + return False + + def __str__(self) -> str: + return "TuningParam(name={}, tunable_type={}, options={}).".format( + self.name, str(self.tunable_type), str(self.options) + ) + + +# Config registry to store all registered configs. +class ConfigRegistry(object): + registered_configs = {} + _config_registry = None + + def __new__(cls) -> Self: + if cls._config_registry is None: + cls._config_registry = super(ConfigRegistry, cls).__new__(cls) + + return cls._config_registry + + @classmethod + def register_config_impl(cls, algo_name: str, priority: Union[float, int] = 0): + """Register config decorator. + + The register the configuration classes for different algorithms. 
+ + Usage example: + @ConfigRegistry.register_config(algo_name=ExampleAlgorithm, priority=100) + class ExampleAlgorithmConfig: + # Configuration details for the ExampleAlgorithm + + Args: + algo_name: the algorithm name. + priority: the priority of the configuration. A larger number indicates a higher priority, + which will be tried first at the auto-tune stage. Defaults to 0. + """ + + def decorator(config_cls): + cls.registered_configs[algo_name] = {"priority": priority, "cls": config_cls} + return config_cls + + return decorator + + @classmethod + def get_all_configs(cls) -> Dict[str, Dict[str, Dict[str, object]]]: + """Get all registered configurations.""" + return cls.registered_configs + + @classmethod + def get_sorted_configs(cls) -> Dict[str, OrderedDict[str, Dict[str, object]]]: + """Get registered configurations sorted by priority.""" + return OrderedDict(sorted(cls.registered_configs.items(), key=lambda x: x[1]["priority"], reverse=True)) + + @classmethod + def get_cls_configs(cls) -> Dict[str, Dict[str, object]]: + """Get registered configurations without priority.""" + cls_configs = {} + for algo_name, config_data in cls.registered_configs.items(): + cls_configs[algo_name] = config_data["cls"] + return cls_configs + + @classmethod + def get_all_config_cls(cls) -> List[Type[BaseConfig]]: + configs_cls = [] + for algo_name, config_pairs in cls.registered_configs.items(): + configs_cls.append(config_pairs["cls"]) + return configs_cls + + +config_registry = ConfigRegistry() + + +def register_config(algo_name: str, priority: Union[float, int] = 0): + """Register config decorator. + + Registers the configuration classes for different algorithms. + + Usage example: + @register_config(algo_name=ExampleAlgorithm, priority=100) + class ExampleAlgorithmConfig: + # Configuration details for the ExampleAlgorithm + + Args: + algo_name: the algorithm name. + priority: the priority of the configuration. A larger number indicates a higher priority, + which will be tried first at the auto-tune stage. Defaults to 0. 
+ """ + + return config_registry.register_config_impl(algo_name=algo_name, priority=priority) + + +class BaseConfig(ABC): + """The base config for all algorithm configs.""" + + name = constants.BASE_CONFIG + params_list: List[Union[str, TuningParam]] = [] + model_params_list: List[Union[str, TuningParam]] = [] + + def __init__( + self, + white_list: Optional[Union[Union[str, Callable], List[Union[str, Callable]]]] = constants.DEFAULT_WHITE_LIST, + ) -> None: + self._global_config: Optional[BaseConfig] = None + # local config is the collections of operator_type configs and operator configs + self._local_config: Dict[str, Optional[BaseConfig]] = {} + self._white_list = white_list + self._config_mapping = OrderedDict() + + def _post_init(self): + if self.white_list == constants.DEFAULT_WHITE_LIST: + global_config = self.get_init_args() + self._global_config = self.__class__(**global_config, white_list=None) + elif isinstance(self.white_list, list) and len(self.white_list) > 0: + for op_name_or_type in self.white_list: + global_config = self.get_init_args() + tmp_config = self.__class__(**global_config, white_list=None) + self.set_local(op_name_or_type, tmp_config) + elif self.white_list == constants.EMPTY_WHITE_LIST: + return + else: + raise NotImplementedError( + f"The white list should be one of {constants.DEFAULT_WHITE_LIST}, {constants.EMPTY_WHITE_LIST}," + " a not empty list, but got {self.white_list}" + ) + + @property + def config_mapping(self): + return self._config_mapping + + @property + def white_list(self): + return self._white_list + + @white_list.setter + def white_list(self, op_name_or_type_list: Optional[List[Union[str, Callable]]]): + self._white_list = op_name_or_type_list + + @property + def global_config(self): + return self._global_config + + @global_config.setter + def global_config(self, config): + self._global_config = config + + @property + def local_config(self): + return self._local_config + + @local_config.setter + def local_config(self, config): + self._local_config = config + + def set_local(self, operator_name: str, config: BaseConfig) -> BaseConfig: + if operator_name in self.local_config and config != self.local_config[operator_name]: + logger.debug("The configuration for %s has already been set, update it.", operator_name) + self.local_config[operator_name] = config + return self + + def to_dict(self): + result = {} + global_config = self.get_init_args() + if bool(self.local_config): + result[constants.LOCAL] = {} + for op_name, config in self.local_config.items(): + result[constants.LOCAL][op_name] = config.to_dict() + if global_config: + result[constants.GLOBAL] = global_config + else: + result = global_config + return result + + def get_params_dict(self): + result = dict() + for param, value in self.__dict__.items(): + if param in self.params_list: + result[param] = value + return result + + def get_init_args(self): + result = dict() + for param, value in self.__dict__.items(): + if param not in ["_global_config", "_local_config", "_white_list", "_config_mapping"]: + result[param] = value + return result + + def __getitem__(self, key): + if hasattr(self, key): + return getattr(self, key) + else: + raise KeyError(f"No such attribute: {key}") + + def __setitem__(self, key, value): + setattr(self, key, value) + + @classmethod + def from_dict(cls, config_dict): + """Construct config from a dict. + + Args: + config_dict: _description_ + + Returns: + The constructed config. 
+ """ + if constants.GLOBAL not in config_dict and constants.LOCAL not in config_dict: + config = cls(**config_dict) + return config + else: + config = cls(**config_dict.get(constants.GLOBAL, {})) + operator_config = config_dict.get(constants.LOCAL, {}) + if operator_config: + for op_name, op_config in operator_config.items(): + config.set_local(op_name, cls(**op_config, white_list=None)) + return config + + def get_diff_dict(self, config) -> Dict[str, Any]: + """Get the difference between current config and user-specific config.""" + diff_cfg = {} + for name, cfg in self.get_init_args().items(): + if hasattr(config, name): + if isinstance(cfg, BaseConfig) and isinstance(config[name], BaseConfig): + diff_cfg[name] = cfg.get_diff_dict(config[name]) + elif cfg != config[name]: + diff_cfg[name] = cfg + else: + diff_cfg[name] = cfg + return diff_cfg + + @classmethod + def from_json_file(cls, filename): + with open(filename, "r", encoding="utf-8") as file: + config_dict = json.load(file) + return cls.from_dict(**config_dict) + + def to_json_file(self, filename): + config_dict = self.to_dict() + with open(filename, "w", encoding="utf-8") as file: + json.dump(config_dict, file, indent=4) + logger.info("Dump the config into %s.", filename) + + def to_json_string(self, use_diff: bool = False) -> Union[str, Dict]: + """Serializes this instance to a JSON string. + + Args: + use_diff (`bool`, *optional*, defaults to `True`): + If set to `True`, only the difference between the config instance and the default `BaseConfig()` + is serialized to JSON string. + + Returns: + `str`: String containing all the attributes that make up this configuration instance in JSON format. + """ + if use_diff is True: + config_dict = self.to_diff_dict(self) + else: + config_dict = self.to_dict() + try: + return json.dumps(config_dict, indent=2) + "\n" + except Exception as e: + logger.error("Failed to serialize the config to JSON string: %s", e) + return config_dict + + def __repr__(self) -> str: + return f"{self.__class__.__name__} {self.to_json_string()}" + + @classmethod + @abstractmethod + def register_supported_configs(cls): + """Add all supported configs.""" + raise NotImplementedError + + @classmethod + def validate(self, user_config: BaseConfig): + # TODO validate the user config + pass + + def __add__(self, other: BaseConfig) -> BaseConfig: + if isinstance(other, type(self)): + for op_name, config in other.local_config.items(): + self.set_local(op_name, config) + return self + else: + return ComposableConfig(configs=[self, other]) + + @staticmethod + def get_the_default_value_of_param(config: BaseConfig, param: str) -> Any: + # Get the signature of the __init__ method + signature = inspect.signature(config.__init__) + + # Get the parameters and their default values + parameters = signature.parameters + return parameters.get(param).default if parameters.get(param) is not None else None + + @staticmethod + def build_tuning_param(config: BaseConfig, param: str): + # Create `tuning.TuningParam` for each param + # There are two cases: + # 1. The param is a string. + # 2. The param is a `tuning.TuningParam` instance. 
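+        # For example (illustrative): RTNConfig declares weight_bits: int = 4, so
+        # build_tuning_param(config, "weight_bits") returns
+        # TuningParam(name="weight_bits", tunable_type=List[int]).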
+ if isinstance(param, str): + signature = inspect.signature(config.__init__) + parameters = signature.parameters + default_param = parameters.get(param).default if parameters.get(param) is not None else None + tuning_param = TuningParam(name=param, tunable_type=List[type(default_param)]) + elif isinstance(param, TuningParam): + tuning_param = param + else: + raise ValueError(f"Unsupported param type: {param}") + return tuning_param + + def expand(self) -> List[BaseConfig]: + """Expand the config. + + Expand rule is: + 1. Expand model_params_list first, then expand params_list + 2. Expand model_params_list/params_list following the order of param order in model_params_list/params_list + + model_params_list=[A, B] params_list=[C,D] + A=[1,2], B=[3,4] C=[5,6], D=[7,8] + + Expanded results: + -------- Combination 1 (C=5, D=7) + / + / -------- Combination 2 (C=6, D=7) + Combination 1 ---- + (A=1, B=3) \ -------- Combination 3 (C=5, D=8) + \ + -------- Combination 4 (C=6, D=8) + + -------- Combination 1 (C=5, D=7) + / + / -------- Combination 2 (C=6, D=7) + Combination 2 ---- + (A=2, B=3) \ -------- Combination 3 (C=5, D=8) + \ + -------- Combination 4 (C=6, D=8) + + -------- Combination 1 (C=5, D=7) + / + / -------- Combination 2 (C=6, D=7) + Combination 3 ---- + (A=1, B=4) \ -------- Combination 3 (C=5, D=8) + \ + -------- Combination 4 (C=6, D=8) + + -------- Combination 1 (C=5, D=7) + / + / -------- Combination 2 (C=6, D=7) + Combination 4 ---- + (A=2, B=4) \ -------- Combination 3 (C=5, D=8) + \ + -------- Combination 4 (C=6, D=8) + """ + config = self + # set model level params + model_level_config_lst: List[BaseConfig] = [] + model_params_list = getattr(self, "model_params_list", []) + tuning_param_list = [] + for param in model_params_list: + tuning_param = self.build_tuning_param(config, param) + param_val = getattr(config, tuning_param.name) + if param_val is not None: + if tuning_param.is_tunable(param_val): + tuning_param.options = param_val + tuning_param_list.append(tuning_param) + + if len(tuning_param_list) == 0: + model_level_config_lst = [config] + else: + tuning_param_name_lst = [tuning_param.name for tuning_param in tuning_param_list] + for params_values in itertools.product(*[tuning_param.options for tuning_param in tuning_param_list[::-1]]): + new_config = copy.deepcopy(self) + for param_name, param_value in zip(tuning_param_name_lst[::-1], params_values): + setattr(new_config, param_name, param_value) + logger.debug(new_config.to_dict()) + model_level_config_lst.append(new_config) + + # set op level params + op_params_list = self.params_list + op_tuning_param_list = [] + local_op_level_config_lst = [] + + for param in op_params_list: + tuning_param = self.build_tuning_param(config, param) + param_val = getattr(config, tuning_param.name) + if param_val is not None: + if tuning_param.is_tunable(param_val) and len(param_val) > 0: + tuning_param.options = param_val + op_tuning_param_list.append(tuning_param) + + if len(op_tuning_param_list) == 0: + local_op_level_config_lst = model_level_config_lst + else: + tuning_param_name_lst = [tuning_param.name for tuning_param in op_tuning_param_list] + tuning_param_val_lst = list( + itertools.product(*[tuning_param.options for tuning_param in op_tuning_param_list[::-1]]) + ) + tuning_param_pair_lst = [dict(zip(tuning_param_name_lst[::-1], val)) for val in tuning_param_val_lst] + + for model_level_config in model_level_config_lst: + for tuning_param_pair in tuning_param_pair_lst: + new_config = copy.deepcopy(model_level_config) + for 
name, val in tuning_param_pair.items(): + setattr(new_config, name, val) + for _, cfg in new_config.local_config.items(): + if isinstance(getattr(cfg, name, None), list) and val in getattr(cfg, name, None): + setattr(cfg, name, val) + logger.debug(new_config.to_dict()) + local_op_level_config_lst.append(new_config) + + logger.info("Expanded the %s and got %d configs.", self.__class__.name, len(local_op_level_config_lst)) + return local_op_level_config_lst + + def _get_op_name_op_type_config(self): + op_type_config_dict = dict() + op_name_config_dict = dict() + for name, config in self.local_config.items(): + if self._is_op_type(name): + op_type_config_dict[name] = config + else: + op_name_config_dict[name] = config + return op_type_config_dict, op_name_config_dict + + def to_config_mapping( + self, config_list: Optional[List[BaseConfig]] = None, model_info: List[Tuple[str, str]] = None + ) -> OrderedDict[Tuple[str, str], OrderedDict[str, BaseConfig]]: + if config_list is None: + config_list = [self] + for config in config_list: + op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() + for op_name, op_type in model_info: + if op_type in op_type_config_dict: + self._config_mapping[op_name] = op_name_config_dict[op_type] + for op_name_pattern in op_name_config_dict: + if isinstance(op_name, str) and re.match(op_name_pattern, op_name): + self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] + elif op_name_pattern == op_name: + self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] + return self._config_mapping + + @staticmethod + def _is_op_type(name: str) -> bool: + return name in constants.STATIC_QOPERATOR_CPU_OP_LIST or name in constants.DYNAMIC_CPU_OP_LIST + + @classmethod + @abstractmethod + def get_config_set_for_tuning(cls): + raise NotImplementedError + + def __eq__(self, other: BaseConfig) -> bool: + if not isinstance(other, type(self)): + return False + return self.get_init_args() == other.get_init_args() + + +class ComposableConfig(BaseConfig): + name = constants.COMPOSABLE_CONFIG + + def __init__(self, configs: List[BaseConfig]) -> None: + self.config_list = configs + self._config_mapping = OrderedDict() + + def __add__(self, other: BaseConfig) -> BaseConfig: + if isinstance(other, type(self)): + self.config_list.extend(other.config_list) + else: + self.config_list.append(other) + return self + + def to_dict(self): + result = {} + for config in self.config_list: + result[config.name] = config.to_dict() + return result + + @classmethod + def from_dict(cls, config_dict: OrderedDict[str, Dict], config_registry: Dict[str, BaseConfig]): + assert len(config_dict) >= 1, "The config dict must include at least one configuration." 
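+        # config_dict is keyed by algorithm name, one entry per config to compose,
+        # e.g. (illustrative): {"RTN": {...}, "GPTQ": {...}}, and config_registry maps
+        # each name to its config class.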
+ num_configs = len(config_dict) + name, value = next(iter(config_dict.items())) + config = config_registry[name].from_dict(value) + for _ in range(num_configs - 1): + name, value = next(iter(config_dict.items())) + config += config_registry[name].from_dict(value) + return config + + def to_json_string(self, use_diff: bool = False) -> str: + return json.dumps(self.to_dict(), indent=2) + "\n" + + def __repr__(self) -> str: + return f"{self.__class__.__name__} {self.to_json_string()}" + + def to_config_mapping( + self, config_list: List[BaseConfig] = None, model_info: Dict[str, Any] = None + ) -> OrderedDict[str, BaseConfig]: + for config in self.config_list: + op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() + single_config_model_info = model_info.get(config.name, None) + for op_name, op_type in single_config_model_info: + if op_type in op_type_config_dict: + self._config_mapping[op_name] = op_name_config_dict[op_type] + for op_name_pattern in op_name_config_dict: + if re.match(op_name_pattern, op_name): + self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] + return self._config_mapping + + @classmethod + def register_supported_configs(cls): + """Add all supported configs.""" + raise NotImplementedError + + @classmethod + def get_config_set_for_tuning(cls) -> None: + # TODO handle the composable config in `tuning_config` + return None + + def get_model_info(self, model, *args, **kwargs): + model_info_dict = dict() + for config in self.config_list: + model_info_dict.update({config.name: config.get_model_info(model, *args, **kwargs)}) + return model_info_dict + + +def get_all_config_set_from_config_registry() -> List[BaseConfig]: + all_registered_config_cls: List[Type[BaseConfig]] = config_registry.get_all_config_cls() + config_set = [] + for config_cls in all_registered_config_cls: + config_set.append(config_cls.get_config_set_for_tuning()) + return config_set + + +def register_supported_configs(): + """Register supported configs.""" + all_registered_config_cls: List[Type[BaseConfig]] = config_registry.get_all_config_cls() + for config_cls in all_registered_config_cls: + config_cls.register_supported_configs() + + +@dataclasses.dataclass +class OperatorConfig: + weight_type: quantization.QuantType + activation_type: quantization.QuantType + per_channel: bool + weight_sym: bool + activation_sym: bool + calibrate_method: quantization.CalibrationMethod = quantization.CalibrationMethod.MinMax + + def __post_init__(self): + self.weight_type = getattr(self.weight_type, "tensor_type", self.weight_type) + self.activation_type = getattr(self.activation_type, "tensor_type", self.activation_type) + self.calibrate_method = getattr(self.calibrate_method, "name", self.calibrate_method) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + setattr(self, key, value) + + def __contains__(self, key): + return hasattr(self, key) + + def update(self, kwargs): + self.weight_type = kwargs.get("weight_type", self.weight_type) + self.activation_type = kwargs.get("activation_type", self.activation_type) + self.per_channel = kwargs.get("per_channel", self.per_channel) + self.weight_sym = kwargs.get("weight_sym", self.weight_sym) + self.calibrate_method = kwargs.get("calibrate_method", self.calibrate_method) + + def to_dict(self): + result = {} + for key, val in self.__dict__.items(): + if not isinstance(val, list): + result[key] = ( + getattr(val, "tensor_type", val) + if isinstance(val, quantization.QuantType) + else 
getattr(val, "value", val) + ) + else: + result[key] = [ + ( + getattr(item, "tensor_type", item) + if isinstance(item, quantization.QuantType) + else getattr(item, "value", item) + ) + for item in val + ] + return result + + def __eq__(self, other): + if isinstance(other, OperatorConfig): + return self.to_dict() == other.to_dict() + else: + return self.to_dict() == other + + +class _OperatorConfig(NamedTuple): + config: OperatorConfig + operators: List[Union[str, Callable]] + valid_func_list: List[Callable] = [] + + +######################## RNT Config ############################### + + +@register_config(algo_name=constants.RTN, priority=constants.PRIORITY_RTN) +class RTNConfig(BaseConfig): + """Config class for round-to-nearest weight-only quantization.""" + + supported_configs: List[_OperatorConfig] = [] + params_list: List[Union[str, TuningParam]] = [ + "weight_dtype", + "weight_bits", + "weight_group_size", + "weight_sym", + "act_dtype", + "accuracy_level", + "ratios", + ] + model_params_list: List[str] = [ + "providers", + "layer_wise_quant", + ] + name: str = constants.RTN + + def __init__( + self, + weight_dtype: str = "int", + weight_bits: int = 4, + weight_group_size: int = 32, + weight_sym: bool = True, + act_dtype: str = "fp32", + accuracy_level: int = 0, + ratios: dict = {}, + providers: List[str] = ["CPUExecutionProvider"], + layer_wise_quant: bool = False, + quant_last_matmul: bool = True, + white_list: List[Union[str, Callable]] = constants.RTN_OP_LIST, + ): + """Init RTN weight-only quantization config. + + Args: + weight_dtype (str, optional): Data type for weights, default is "int". + weight_bits (int, optional): Number of bits used to represent weights, default is 4. + weight_group_size (int, optional): Size of weight groups, default is 32. + weight_sym (bool, optional): Indicates whether weights are symmetric, default is True. + act_dtype (str, optional): Data type for activations, default is "fp32". + accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), + 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), + 4 (int8 compute type of jblas kernel). Defaults to 0. + ratios (dict, optional): percentile of clip. Defaults to {}. + providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. + layer_wise_quant (bool, optional): whether to quantize model layer by layer to save memory footprint. + Check below link for details + https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_layer_wise.md, + default is False. + quant_last_matmul (bool, optional): whether to quantize the last matmul of the model, default is True. + white_list (list, optional): op in white_list will be applied current config. + Defaults to constants.DEFAULT_WHITE_LIST. 
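+
+        Usage example (an illustrative sketch; the op-name pattern is hypothetical):
+
+            from onnx_neural_compressor.quantization import config
+
+            # 8-bit RTN as the global setting, plus a 4-bit override for MatMul nodes
+            # whose name matches the given pattern.
+            cfg = config.RTNConfig(weight_bits=8)
+            cfg.set_local("/attn/.*MatMul", config.RTNConfig(weight_bits=4, white_list=None))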
+ """ + super().__init__(white_list=white_list) + self.weight_bits = weight_bits + self.weight_dtype = weight_dtype + self.weight_group_size = weight_group_size + self.weight_sym = weight_sym + self.act_dtype = act_dtype + self.accuracy_level = accuracy_level + self.ratios = ratios + self.providers = providers + self.layer_wise_quant = layer_wise_quant + self.quant_last_matmul = quant_last_matmul + self._post_init() + + def _post_init(self): + if self.white_list == constants.RTN_OP_LIST: + global_config = self.get_init_args() + self._global_config = self.__class__(**global_config, white_list=None) + elif isinstance(self.white_list, list) and len(self.white_list) > 0: + for op_name_or_type in self.white_list: + global_config = self.get_init_args() + tmp_config = self.__class__(**global_config, white_list=None) + self.set_local(op_name_or_type, tmp_config) + elif self.white_list == constants.EMPTY_WHITE_LIST: + return + + def get_model_params_dict(self): + result = dict() + for param in self.model_params_list: + result[param] = getattr(self, param) + return result + + @classmethod + def register_supported_configs(cls) -> None: + supported_configs = [] + linear_rtn_config = RTNConfig( + weight_dtype=["int"], + weight_bits=[1, 2, 3, 4, 5, 6, 7, 8], + weight_group_size=[32, -1, 1, 16, 64, 128, 256, 512, 1024], + weight_sym=[True, False], + act_dtype=["fp32"], + ) + operators = constants.RTN_OP_LIST + supported_configs.append(_OperatorConfig(config=linear_rtn_config, operators=operators)) + cls.supported_configs = supported_configs + + def to_config_mapping(self, config_list: List[BaseConfig] = None, model_info: list = None): + if config_list is None: + config_list = [self] + for config in config_list: + # update model level setting + self._config_mapping.update(config.get_model_params_dict()) + + # update node level setting + last_matmul = None + global_config = config.get_params_dict() + op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() + for op_name, op_type in model_info: + if op_type == "MatMul": + last_matmul = op_name + if global_config is not None: + self._config_mapping[op_name] = global_config + if op_type in op_type_config_dict: + self._config_mapping[op_name] = op_type_config_dict[op_type] + for op_name_pattern in op_name_config_dict: + if re.match(op_name_pattern, op_name): + self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] + if op_name in self._config_mapping and hasattr(self._config_mapping[op_name], "to_dict"): + self._config_mapping[op_name] = self._config_mapping[op_name].to_dict() + if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: + del self._config_mapping[last_matmul] + return self._config_mapping + + @staticmethod + def get_model_info(model: Union[onnx.ModelProto, pathlib.Path, str], white_list=constants.RTN_OP_LIST) -> list: + if not isinstance(model, onnx.ModelProto): + model = onnx.load(model, load_external_data=False) + + filter_result = [] + for node in model.graph.node: + if node.op_type in white_list: + pair = (node.name, node.op_type) + filter_result.append(pair) + logger.debug(f"Get model info: {filter_result}") + return filter_result + + @classmethod + def get_config_set_for_tuning(cls) -> Union[None, "RTNConfig", List["RTNConfig"]]: # pragma: no cover + return RTNConfig(weight_bits=[4, 8], weight_sym=[True, False]) + + +def get_default_rtn_config() -> RTNConfig: + """Generate the default rtn config. + + Returns: + the default rtn config. 
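+
+    Usage example (an illustrative sketch):
+        from onnx_neural_compressor.quantization import config
+
+        cfg = config.get_default_rtn_config()
+        assert cfg.weight_bits == 4 and cfg.weight_group_size == 32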
+ """ + return RTNConfig() + + +######################## GPTQ Config ############################### + + +@register_config(algo_name=constants.GPTQ, priority=constants.PRIORITY_GPTQ) +class GPTQConfig(BaseConfig): + """Config class for gptq weight-only quantization.""" + + supported_configs: List[_OperatorConfig] = [] + params_list: List[Union[str, TuningParam]] = [ + "weight_dtype", + "weight_bits", + "weight_group_size", + "weight_sym", + "act_dtype", + "accuracy_level", + ] + model_params_list: List[Union[str, TuningParam]] = [ + "percdamp", + "block_size", + "actorder", + "mse", + "perchannel", + "providers", + "layer_wise_quant", + ] + name: str = constants.GPTQ + + def __init__( + self, + weight_dtype: str = "int", + weight_bits: int = 4, + weight_group_size: int = 32, + weight_sym: bool = True, + act_dtype: str = "fp32", + accuracy_level: int = 0, + percdamp: float = 0.01, + block_size: int = 128, + actorder: bool = False, + mse: bool = False, + perchannel: bool = True, + providers: List[str] = ["CPUExecutionProvider"], + layer_wise_quant: bool = False, + quant_last_matmul: bool = True, + white_list: List[Union[str, Callable]] = constants.GPTQ_OP_LIST, + ): + """Init GPTQ weight-only quantization config. + + Args: + weight_dtype (str, optional): data type for weights. Defaults to "int". + weight_bits (int, optional): number of bits used to represent weights. Defaults to 4. + weight_group_size (int, optional): size of weight groups. Defaults to 32. + weight_sym (bool, optional): indicates whether weights are symmetric. Defaults to True. + act_dtype (str, optional): data type for activations. Defaults to "fp32". + accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), + 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), + 4 (int8 compute type of jblas kernel). Defaults to 0. + percdamp (float, optional): percentage of Hessian's diagonal values' average, which will be added + to Hessian's diagonal to increase numerical stability. Defaults to 0.01. + block_size (int, optional): execute GPTQ quantization per block. Defaults to 128. + actorder (bool, optional): whether to sort Hessian's diagonal values to rearrange channel-wise + quantization order. Defaults to False. + mse (bool, optional): whether get scale and zero point with mse error. Defaults to False. + perchannel (bool, optional): whether quantize weight per-channel. Defaults to True. + providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. + layer_wise_quant (bool, optional): whether to quantize model layer by layer to save memory footprint. + Check below link for details + https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_layer_wise.md, + default is False. + quant_last_matmul (bool, optional): whether to quantize the last matmul of the model, default is True. + white_list (list, optional): op in white_list will be applied current config. + Defaults to constants.DEFAULT_WHITE_LIST. 
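+
+        Usage example (an illustrative sketch):
+
+            from onnx_neural_compressor.quantization import config
+
+            # GPTQ with activation reordering and layer-wise quantization to limit
+            # peak memory on large models.
+            cfg = config.GPTQConfig(weight_bits=4, actorder=True, layer_wise_quant=True)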
+ """ + super().__init__(white_list=white_list) + self.weight_bits = weight_bits + self.weight_dtype = weight_dtype + self.weight_group_size = weight_group_size + self.weight_sym = weight_sym + self.act_dtype = act_dtype + self.accuracy_level = accuracy_level + self.percdamp = percdamp + self.block_size = block_size + self.actorder = actorder + self.mse = mse + self.perchannel = perchannel + self.providers = providers + self.layer_wise_quant = layer_wise_quant + self.quant_last_matmul = quant_last_matmul + self._post_init() + + def _post_init(self): + if self.white_list == constants.GPTQ_OP_LIST: + global_config = self.get_init_args() + self._global_config = self.__class__(**global_config, white_list=None) + elif isinstance(self.white_list, list) and len(self.white_list) > 0: + for op_name_or_type in self.white_list: + global_config = self.get_init_args() + tmp_config = self.__class__(**global_config, white_list=None) + self.set_local(op_name_or_type, tmp_config) + elif self.white_list == constants.EMPTY_WHITE_LIST: + return + + def get_model_params_dict(self): + result = dict() + for param in self.model_params_list: + result[param] = getattr(self, param) + return result + + @classmethod + def register_supported_configs(cls) -> None: + supported_configs = [] + linear_gptq_config = GPTQConfig( + weight_dtype=["int"], + weight_bits=[1, 2, 3, 4, 5, 6, 7, 8], + weight_group_size=[32, -1, 1, 16, 64, 128, 256, 512, 1024], + weight_sym=[True, False], + act_dtype=["fp32"], + actorder=[True, False], + mse=[True, False], + perchannel=[True, False], + ) + operators = constants.GPTQ_OP_LIST + supported_configs.append(_OperatorConfig(config=linear_gptq_config, operators=operators)) + cls.supported_configs = supported_configs + + def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: + if config_list is None: + config_list = [self] + for config in config_list: + # update model level setting + self._config_mapping.update(config.get_model_params_dict()) + + # update node level setting + last_matmul = None + global_config = config.get_params_dict() + op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() + for op_name, op_type in model_info: + if op_type == "MatMul": + last_matmul = op_name + if global_config is not None: + self._config_mapping[op_name] = global_config + if op_type in op_type_config_dict: + self._config_mapping[op_name] = op_type_config_dict[op_type] + for op_name_pattern in op_name_config_dict: + if re.match(op_name_pattern, op_name): + self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] + if op_name in self._config_mapping and hasattr(self._config_mapping[op_name], "to_dict"): + self._config_mapping[op_name] = self._config_mapping[op_name].to_dict() + if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: + del self._config_mapping[last_matmul] + return self._config_mapping + + @staticmethod + def get_model_info(model: Union[onnx.ModelProto, pathlib.Path, str], white_list=constants.GPTQ_OP_LIST) -> list: + if not isinstance(model, onnx.ModelProto): + model = onnx.load(model, load_external_data=False) + + filter_result = [] + for node in model.graph.node: + if node.op_type in white_list: + pair = (node.name, node.op_type) + filter_result.append(pair) + logger.debug(f"Get model info: {filter_result}") + return filter_result + + @classmethod + def get_config_set_for_tuning(cls) -> Union[None, "GPTQConfig", List["GPTQConfig"]]: # pragma: no cover + return GPTQConfig( 
+ weight_bits=[4, 8], + weight_sym=[True, False], + actorder=[True, False], + mse=[True, False], + perchannel=[True, False], + ) + + +def get_default_gptq_config() -> GPTQConfig: + """Generate the default gptq config. + + Returns: + the default gptq config. + """ + return GPTQConfig() + + +######################## AWQ Config ############################### + + +@register_config(algo_name=constants.AWQ, priority=constants.PRIORITY_AWQ) +class AWQConfig(BaseConfig): + """Config class for awq weight-only quantization.""" + + supported_configs: List[_OperatorConfig] = [] + params_list: List[str] = [ + "weight_dtype", + "weight_bits", + "weight_group_size", + "weight_sym", + "act_dtype", + "accuracy_level", + ] + model_params_list: List[str] = [ + "enable_auto_scale", + "enable_mse_search", + "providers", + ] + name: str = constants.AWQ + + def __init__( + self, + weight_dtype: str = "int", + weight_bits: int = 4, + weight_group_size: int = 32, + weight_sym: bool = True, + act_dtype: str = "fp32", + accuracy_level: int = 0, + enable_auto_scale: bool = True, + enable_mse_search: bool = True, + providers: List[str] = ["CPUExecutionProvider"], + quant_last_matmul: bool = True, + white_list: List[Union[str, Callable]] = constants.AWQ_OP_LIST, + ): + """Init AWQ weight-only quantization config. + + Args: + weight_dtype (str, optional): data type for weights. Defaults to "int". + weight_bits (int, optional): number of bits used to represent weights. Defaults to 4. + weight_group_size (int, optional): size of weight groups. Defaults to 32. + weight_sym (bool, optional): indicates whether weights are symmetric. Defaults to True. + act_dtype (str, optional): data type for activations. Defaults to "fp32". + accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), + 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), + 4 (int8 compute type of jblas kernel). Defaults to 0. + enable_auto_scale (bool, optional): whether to search for best scales based on activation distribution. + Defaults to True. + enable_mse_search (bool, optional): whether to search for the best clip range from range + [0.91, 1.0, 0.01]. Defaults to True. + providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. + quant_last_matmul (bool, optional): whether to quantize the last matmul of the model, default is True. + white_list (list, optional): op in white_list will be applied current config. + Defaults to constants.DEFAULT_WHITE_LIST. 
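+
+        Usage example (an illustrative sketch):
+
+            from onnx_neural_compressor.quantization import config
+
+            # AWQ with automatic scale search enabled and MSE-based clip search disabled.
+            cfg = config.AWQConfig(weight_bits=4, enable_auto_scale=True, enable_mse_search=False)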
+ """ + super().__init__(white_list=white_list) + self.weight_bits = weight_bits + self.weight_dtype = weight_dtype + self.weight_group_size = weight_group_size + self.weight_sym = weight_sym + self.act_dtype = act_dtype + self.accuracy_level = accuracy_level + self.enable_auto_scale = enable_auto_scale + self.enable_mse_search = enable_mse_search + self.providers = providers + self.quant_last_matmul = quant_last_matmul + self._post_init() + + def _post_init(self): + if self.white_list == constants.GPTQ_OP_LIST: + global_config = self.get_init_args() + self._global_config = self.__class__(**global_config, white_list=None) + elif isinstance(self.white_list, list) and len(self.white_list) > 0: + for op_name_or_type in self.white_list: + global_config = self.get_init_args() + tmp_config = self.__class__(**global_config, white_list=None) + self.set_local(op_name_or_type, tmp_config) + elif self.white_list == constants.EMPTY_WHITE_LIST: + return + + def get_model_params_dict(self): + result = dict() + for param in self.model_params_list: + result[param] = getattr(self, param) + return result + + @classmethod + def register_supported_configs(cls) -> List[_OperatorConfig]: + supported_configs = [] + linear_awq_config = AWQConfig( + weight_dtype=["int"], + weight_bits=[1, 2, 3, 4, 5, 6, 7, 8], + weight_group_size=[32, -1, 1, 16, 64, 128, 256, 512, 1024], + weight_sym=[True, False], + act_dtype=["fp32"], + enable_auto_scale=[True, False], + enable_mse_search=[True, False], + ) + operators = constants.AWQ_OP_LIST + supported_configs.append(_OperatorConfig(config=linear_awq_config, operators=operators)) + cls.supported_configs = supported_configs + + def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: + if config_list is None: + config_list = [self] + for config in config_list: + # update model level setting + self._config_mapping.update(config.get_model_params_dict()) + + # update node level setting + last_matmul = None + global_config = config.get_params_dict() + op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() + for op_name, op_type in model_info: + if op_type == "MatMul": + last_matmul = op_name + if global_config is not None: + self._config_mapping[op_name] = global_config + if op_type in op_type_config_dict: + self._config_mapping[op_name] = op_type_config_dict[op_type] + for op_name_pattern in op_name_config_dict: + if re.match(op_name_pattern, op_name): + self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] + if op_name in self._config_mapping and hasattr(self._config_mapping[op_name], "to_dict"): + self._config_mapping[op_name] = self._config_mapping[op_name].to_dict() + if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: + del self._config_mapping[last_matmul] + return self._config_mapping + + @staticmethod + def get_model_info(model: Union[onnx.ModelProto, pathlib.Path, str], white_list=constants.AWQ_OP_LIST) -> list: + if not isinstance(model, onnx.ModelProto): + model = onnx.load(model, load_external_data=False) + + filter_result = [] + for node in model.graph.node: + if node.op_type in white_list: + pair = (node.name, node.op_type) + filter_result.append(pair) + logger.debug(f"Get model info: {filter_result}") + return filter_result + + @classmethod + def get_config_set_for_tuning(cls) -> Union[None, "AWQConfig", List["AWQConfig"]]: # pragma: no cover + return AWQConfig( + weight_bits=[4, 8], + weight_sym=[True, False], + enable_auto_scale=[True, 
False], + enable_mse_search=[True, False], + ) + + +def get_default_awq_config() -> AWQConfig: + """Generate the default awq config. + + Returns: + the default awq config. + """ + return AWQConfig() + + +######################## WOQ Tuning Config ############################### + + +def get_woq_tuning_config() -> list: + """Generate the config set for WOQ tuning. + + Returns: + the list of WOQ quant config. + """ + RTN_G32ASYM = RTNConfig(weight_sym=False) + GPTQ_G32ASYM = GPTQConfig(weight_sym=False) + GPTQ_G32ASYM_DISABLE_LAST_MATMUL = GPTQConfig(weight_sym=False, quant_last_matmul=False) + GPTQ_G128ASYM = GPTQConfig(weight_group_size=128, weight_sym=False) + AWQ_G32ASYM = AWQConfig(weight_sym=False) + return [RTN_G32ASYM, GPTQ_G32ASYM, GPTQ_G32ASYM_DISABLE_LAST_MATMUL, GPTQ_G128ASYM, AWQ_G32ASYM] + + +##################### Config for ONNXRuntime-like user-facing API ############ + + +class ExtraOptions: + def __init__( + self, + ActivationSymmetric=False, + WeightSymmetric=True, + AddQDQPairToWeight=False, + OpTypesToExcludeOutputQuantization=[], + DedicatedQDQPair=False, + SmoothQuant=False, + SmoothQuantAlpha=0.5, + SmoothQuantFolding=True, + SmoothQuantOpTypes=["Gemm", "Conv", "MatMul", "FusedConv"], + SmoothQuantCalibIter=100, + SmoothQuantScalesPerOp=True, + **kwargs, + ): + self.ActivationSymmetric = ActivationSymmetric + self.WeightSymmetric = WeightSymmetric + self.AddQDQPairToWeight = AddQDQPairToWeight + self.OpTypesToExcludeOutputQuantization = OpTypesToExcludeOutputQuantization + self.DedicatedQDQPair = DedicatedQDQPair + self.SmoothQuant = SmoothQuant + self.SmoothQuantAlpha = SmoothQuantAlpha + self.SmoothQuantFolding = SmoothQuantFolding + self.SmoothQuantOpTypes = SmoothQuantOpTypes + self.SmoothQuantCalibIter = SmoothQuantCalibIter + self.SmoothQuantScalesPerOp = SmoothQuantScalesPerOp + + +def static_basic_check(config, optype, execution_provider, quant_format): + if getattr(quant_format, "value", quant_format) == 0: + if execution_provider not in constants.STATIC_QOPERATOR_OP_LIST_MAP: + raise ValueError( + "Unsupported execution_provider {}, only support {}.".format( + execution_provider, list(constants.STATIC_QOPERATOR_OP_LIST_MAP.keys()) + ) + ) + supported_optype = constants.STATIC_QOPERATOR_OP_LIST_MAP[execution_provider] + if optype not in supported_optype: + raise ValueError( + "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) + ) + elif getattr(quant_format, "value", quant_format) == 1: + if execution_provider not in constants.STATIC_QDQ_OP_LIST_MAP: + raise ValueError( + "Unsupported execution_provider {}, only support {}.".format( + execution_provider, list(constants.STATIC_QDQ_OP_LIST_MAP.keys()) + ) + ) + supported_optype = constants.STATIC_QDQ_OP_LIST_MAP[execution_provider] + if optype not in supported_optype: + raise ValueError( + "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) + ) + else: + raise ValueError( + "Unsupported quant_format {}, only support QuantFormat.QOperator and QuantFormat.QDQ.".format(quant_format) + ) + return config + + +def static_cpu_check(config, optype, execution_provider, quant_format): + if execution_provider != "CPUExecutionProvider": + return config + + # only support per-tensor + if optype in [ + "EmbedLayerNormalization", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Tile", + "Unsqueeze", + "Transpose", + 
"Resize", + "Abs", + "Shrink", + "Sign", + "Attention", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", + "Add", + "Mul", + "ArgMax", + ]: + setattr(config, "per_channel", False) + + if optype in ["Attention"]: + setattr(config, "activation_type", onnx.TensorProto.UINT8) + return config + + +def static_cuda_check(config, optype, execution_provider, quant_format): + if execution_provider != "CUDAExecutionProvider": + return config + + # only support per-tensor + if optype in [ + "EmbedLayerNormalization", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Tile", + "Unsqueeze", + "Transpose", + "Resize", + "Abs", + "Shrink", + "Sign", + "Attention", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", + "Add", + "Mul", + "ArgMax", + ]: + setattr(config, "per_channel", False) + + if optype in ["Attention"]: + setattr(config, "activation_type", onnx.TensorProto.INT8) + setattr(config, "weight_type", onnx.TensorProto.INT8) + return config + + +def static_dml_check(config, optype, execution_provider, quant_format): + if execution_provider != "DmlExecutionProvider": + return config + + # only support per-tensor + if optype in ["Conv", "MatMul", "Mul", "Relu", "Clip", "MaxPool", "Add"]: + setattr(config, "per_channel", False) + return config + + +def static_dnnl_check(config, optype, execution_provider, quant_format): + if execution_provider != "DnnlExecutionProvider": + return config + + # current configurations are same as CPU EP + return static_cpu_check(config, optype, execution_provider, quant_format) + + +def static_trt_check(config, optype, execution_provider, quant_format): + if execution_provider != "TensorrtExecutionProvider": + return config + + # only support S8S8 + if optype in ["Conv", "MatMul", "Gather", "Gemm"]: + setattr(config, "weight_type", onnx.TensorProto.INT8) + setattr(config, "weight_sym", True) + setattr(config, "activation_type", onnx.TensorProto.INT8) + setattr(config, "activation_sym", True) + setattr(config, "per_channel", [False, True]) + else: + setattr(config, "weight_type", onnx.TensorProto.INT8) + setattr(config, "weight_sym", True) + setattr(config, "activation_type", onnx.TensorProto.INT8) + setattr(config, "activation_sym", True) + return config + + +STATIC_CHECK_FUNC_LIST = [ + static_basic_check, + static_cpu_check, + static_cuda_check, + static_dml_check, + static_dnnl_check, + static_trt_check, +] + + +def dynamic_basic_check(config, optype, execution_provider, quant_format=None): + if execution_provider not in constants.DYNAMIC_OP_LIST_MAP: + raise ValueError( + "Unsupported execution_provider {}, only support {}.".format( + execution_provider, list(constants.DYNAMIC_OP_LIST_MAP.keys()) + ) + ) + + supported_optype = constants.DYNAMIC_OP_LIST_MAP[execution_provider] + if optype not in supported_optype: + raise ValueError( + "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) + ) + return config + + +def dynamic_cpu_check(config, optype, execution_provider, quant_format=None): + if execution_provider != "CPUExecutionProvider": + return config + # TODO: add constraints for other EP + if optype in ["FusedConv", "Conv", "EmbedLayerNormalization", "Gather", "Attention", "LSTM"]: + setattr(config, "per_channel", False) + return config + + +def dynamic_cuda_check(config, optype, execution_provider, quant_format=None): 
+ if execution_provider != "CUDAExecutionProvider": + return config + # current configurations are same as CPU EP + return dynamic_cpu_check(config, optype, execution_provider, quant_format) + + +def dynamic_dml_check(config, optype, execution_provider, quant_format=None): + if execution_provider != "DmlExecutionProvider": + return config + + # don't support dynamic quantization + return None + + +def dynamic_dnnl_check(config, optype, execution_provider, quant_format=None): + if execution_provider != "DnnlExecutionProvider": + return config + # current configurations are same as CPU EP + return dynamic_cpu_check(config, optype, execution_provider, quant_format) + + +def dynamic_trt_check(config, optype, execution_provider, quant_format=None): + if execution_provider != "TensorrtExecutionProvider": + return config + + # don't support dynamic quantization + return None + + +DYNAMIC_CHECK_FUNC_LIST = [ + dynamic_basic_check, + dynamic_cpu_check, + dynamic_cuda_check, + dynamic_dml_check, + dynamic_dnnl_check, + dynamic_trt_check, +] + + +@register_config(algo_name=constants.STATIC_QUANT, priority=constants.PRIORITY_STATIC_QUANT) +class StaticQuantConfig(BaseConfig, ort_quant.StaticQuantConfig): + + supported_configs: List[_OperatorConfig] = [] + params_list: List[str] = [ + "weight_type", + "activation_type", + "per_channel", + "weight_sym", + "activation_sym", + "calibrate_method", + ] + model_params_list: List[str] = [ + "quant_format", + "reduce_range", + "use_external_data_format", + "calibration_sampling_size", + "quant_last_matmul", + ] + name: str = constants.STATIC_QUANT + + def __init__( + self, + calibration_data_reader: data_reader.CalibrationDataReader = None, + calibrate_method=quantization.CalibrationMethod.MinMax, + quant_format=quantization.QuantFormat.QOperator, + activation_type=quantization.QuantType.QInt8, + weight_type=quantization.QuantType.QInt8, + op_types_to_quantize=None, + nodes_to_quantize=None, + nodes_to_exclude=None, + per_channel=False, + reduce_range=False, + use_external_data_format=False, + extra_options=None, + calibration_sampling_size=100, + quant_last_matmul=True, + execution_provider=None, + white_list: list = constants.DEFAULT_WHITE_LIST, + **kwargs, + ): + """This is a class for static Quant Configuration. + + Inherit from StaticQuantConfig: + https://github.com/microsoft/onnxruntime/blob/v1.17.1/onnxruntime/python/tools/quantization/quantize.py#L78 + extra_options: + Support smoothquant args. + - SmoothQuant = True/False : + Default is False. If enabled, SmoothQuant algorithm will be applied before quantization to do + fake input channel quantization. + - SmoothQuantAlpha = float : + Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight + and activation quantization. A larger alpha value could be used on models with more significant + activation outliers to migrate more quantization difficulty to weights. + - SmoothQuantFolding = True/False : + Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during + SmoothQuant will be folded into the previous op if the previous op is foldable. + - SmoothQuantOpTypes = list (new args): + Default is ["Gemm", "Conv", "MatMul", "FusedConv"]. It only works if SmoothQuant is True. + It controls the op types to be smooth quantized. + - SmoothQuantCalibIter = int (new args): + Default is 100. It only works if SmoothQuant is True. It controls the iteration num for calibration. + - SmoothQuantScalesPerOp = True/False (new args) : + Default is True. 
It only works if SmoothQuant is True. + If enabled, each op will have an individual scale, mainlyfor accuracy. + If not enabled, ops with the same input will share a scale, mainly for performance. + """ + if execution_provider is None: + execution_provider = utility.auto_detect_ep() + if op_types_to_quantize is None: + op_types_to_quantize = ( + constants.STATIC_QOPERATOR_OP_LIST_MAP.get(execution_provider, []) + if quant_format == quantization.QuantFormat.QOperator + else constants.STATIC_QDQ_OP_LIST_MAP.get(execution_provider, []) + ) + if not reduce_range and not utility.CpuInfo().vnni: + logger.warning( + "VNNI is not supported and reduce_range=False, reduce_range=True is recommended to avoid potential accuracy issue." + ) + ort_quant.StaticQuantConfig.__init__( + self, + calibration_data_reader=calibration_data_reader, + calibrate_method=calibrate_method, + quant_format=quant_format, + activation_type=activation_type, + weight_type=weight_type, + op_types_to_quantize=op_types_to_quantize, + nodes_to_quantize=nodes_to_quantize, + nodes_to_exclude=nodes_to_exclude, + per_channel=per_channel, + reduce_range=reduce_range, + use_external_data_format=use_external_data_format, + extra_options=extra_options, + ) + # do not load TensorRT if backend is not TensorrtExecutionProvider + if "TensorrtExecutionProvider" in execution_provider: + logger.info("Update some parameters for TensorrtExecutionProvider") + os.environ["ORT_TENSORRT_INT8_ENABLE"] = "0" + self.extra_options.update( + { + "AddQDQPairToWeight": True, + "DedicatedQDQPair": True, + "OpTypesToExcludeOutputQuantization": ["Conv", "Gemm", "Add", "MatMul"], + } + ) + else: + os.environ["ORT_TENSORRT_UNAVAILABLE"] = "1" + + BaseConfig.__init__(self, white_list=self.op_types_to_quantize) + self.execution_provider = execution_provider + self.quant_last_matmul = quant_last_matmul + self.calibration_sampling_size = calibration_sampling_size + _extra_options = ExtraOptions(**self.extra_options) + self.weight_sym = _extra_options.WeightSymmetric + self.activation_sym = _extra_options.ActivationSymmetric + self.optypes_to_exclude_output_quant = _extra_options.OpTypesToExcludeOutputQuantization + self.dedicated_qdq_pair = _extra_options.DedicatedQDQPair + self.add_qdq_pair_to_weight = _extra_options.AddQDQPairToWeight + self.white_list = white_list + self._post_init() + + @staticmethod + def get_model_info(model, white_list=constants.STATIC_QOPERATOR_CPU_OP_LIST) -> list: + if not isinstance(model, onnx.ModelProto): + model = onnx.load(model, load_external_data=False) + + filter_result = [] + for node in model.graph.node: + if node.op_type in white_list: + pair = (node.name, node.op_type) + filter_result.append(pair) + logger.debug(f"Get model info: {filter_result}") + return filter_result + + def get_model_params_dict(self): + result = dict() + for param in self.model_params_list: + result[param] = getattr(self, param) + return result + + def _post_init(self): + for op_name_or_type in self.op_types_to_quantize: + params = self.get_params_dict() + op_config = OperatorConfig(**params) + + for valid_func in STATIC_CHECK_FUNC_LIST: + op_config = valid_func(op_config, op_name_or_type, self.execution_provider, self.quant_format) + self.set_local(op_name_or_type, op_config) + if isinstance(self.white_list, list) and len(self.white_list) > 0: + for op_name_or_type in self.white_list: + global_config = self.get_init_args() + tmp_config = self.__class__(**global_config, white_list=None) + self.set_local(op_name_or_type, tmp_config) + + def 
to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: + if config_list is None: + config_list = [self] + for config in config_list: + # update model level setting + self._config_mapping.update(config.get_model_params_dict()) + + # update node level setting + global_config = config.global_config + op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() + last_matmul = None + for op_name, op_type in model_info: + if op_type == "MatMul": + last_matmul = op_name + if ( + isinstance(self.op_types_to_quantize, list) + and len(self.op_types_to_quantize) > 0 + and op_type not in self.op_types_to_quantize + ): + continue + if ( + isinstance(self.nodes_to_quantize, list) + and len(self.nodes_to_quantize) > 0 + and op_name not in self.nodes_to_quantize + ): + continue + if ( + isinstance(self.nodes_to_exclude, list) + and len(self.nodes_to_exclude) > 0 + and op_name in self.nodes_to_exclude + ): + continue + if op_type in op_type_config_dict: + self._config_mapping[op_name] = op_type_config_dict[op_type] + for op_name_pattern in op_name_config_dict: + if re.match(op_name_pattern, op_name): + self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] + + if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: + del self._config_mapping[last_matmul] + return self._config_mapping + + @classmethod + def get_config_set_for_tuning( + cls, + quant_format=quantization.QuantFormat.QOperator, + execution_provider=None, + op_types_to_quantize=None, + nodes_to_exclude=None, + reduce_range=False, + use_external_data_format=False, + calibration_sampling_size=100, + quant_last_matmul=True, + **kwargs, + ) -> Union[None, "StaticQuantConfig", List["StaticQuantConfig"]]: # pragma: no cover + if execution_provider is None: + execution_provider = utility.auto_detect_ep() + StaticQuantConfig.register_supported_configs() + if op_types_to_quantize is None: + op_types_to_quantize = ( + constants.STATIC_QOPERATOR_OP_LIST_MAP.get(execution_provider, []) + if quant_format == quantization.QuantFormat.QOperator + else constants.STATIC_QDQ_OP_LIST_MAP.get(execution_provider, []) + ) + + op_type_candidate = [ + op_types_to_quantize, + list(set(op_types_to_quantize).difference({"Add", "Mul"})), + list(set(op_types_to_quantize).difference({"Add", "Mul", "Gather", "GatherElements", "GatherND"})), + list( + set(op_types_to_quantize).difference( + {"Add", "Mul", "Gather", "GatherElements", "GatherND", "Attention"} + ) + ), + ] + + cfg_lst = [] + for item in op_type_candidate: + cfg_lst.append( + StaticQuantConfig( + execution_provider=execution_provider, + quant_format=quant_format, + reduce_range=reduce_range, + use_external_data_format=use_external_data_format, + calibration_sampling_size=calibration_sampling_size, + op_types_to_quantize=item, + nodes_to_exclude=nodes_to_exclude, + quant_last_matmul=[True, False], + per_channel=[True, False], + **kwargs, + ) + ) + return cfg_lst + + @classmethod + def register_supported_configs(cls) -> None: + supported_configs = [] + supported_configs.append( + _OperatorConfig( + config=OperatorConfig( + weight_type=onnx.TensorProto.UINT8, + weight_sym=False, + per_channel=[True, False], + calibrate_method=[ + quantization.CalibrationMethod.MinMax, + quantization.CalibrationMethod.Entropy, + quantization.CalibrationMethod.Percentile, + ], + activation_type=onnx.TensorProto.UINT8, + activation_sym=False, + ), + operators=["GatherND", "GatherElements", "Gather"], + 
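+                # Gather-family ops use asymmetric UINT8 weights and activations;
+                # per-channel is left tunable ([True, False]).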
valid_func_list=STATIC_CHECK_FUNC_LIST, + ) + ) + supported_configs.append( + _OperatorConfig( + config=OperatorConfig( + weight_type=onnx.TensorProto.UINT8, + weight_sym=False, + per_channel=False, + calibrate_method=[ + quantization.CalibrationMethod.MinMax, + quantization.CalibrationMethod.Entropy, + quantization.CalibrationMethod.Percentile, + ], + activation_type=onnx.TensorProto.UINT8, + activation_sym=False, + ), + operators=["EmbedLayerNormalization"], + valid_func_list=STATIC_CHECK_FUNC_LIST, + ) + ) + supported_configs.append( + _OperatorConfig( + config=OperatorConfig( + weight_type=onnx.TensorProto.INT8, + weight_sym=True, + per_channel=[True, False], + calibrate_method=[ + quantization.CalibrationMethod.MinMax, + quantization.CalibrationMethod.Entropy, + quantization.CalibrationMethod.Percentile, + ], + activation_type=onnx.TensorProto.UINT8, + activation_sym=False, + ), + operators=["Conv", "MatMul", "Gemm", "FusedConv"], + valid_func_list=STATIC_CHECK_FUNC_LIST, + ) + ) + supported_configs.append( + _OperatorConfig( + config=OperatorConfig( + weight_type=onnx.TensorProto.INT8, + weight_sym=True, + per_channel=False, + calibrate_method=[ + quantization.CalibrationMethod.MinMax, + quantization.CalibrationMethod.Entropy, + quantization.CalibrationMethod.Percentile, + ], + activation_type=onnx.TensorProto.UINT8, + activation_sym=False, + ), + operators=[ + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Tile", + "Unsqueeze", + "Transpose", + "Resize", + "Abs", + "Shrink", + "Sign", + "Attention", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", + "Add", + "Mul", + "ArgMax", + ], + valid_func_list=STATIC_CHECK_FUNC_LIST, + ) + ) + cls.supported_configs = supported_configs + + def to_dict(self): + result = {} + for key, val in self.__dict__.items(): + if key in ["_global_config", "_config_mapping"]: + continue + if key == "_local_config": + local_result = {} + for name, cfg in val.items(): + local_result[name] = cfg.to_dict() + result[key] = local_result + continue + if not isinstance(val, list): + result[key] = ( + getattr(val, "tensor_type", val) + if isinstance(val, quantization.QuantType) + else getattr(val, "value", val) + ) + else: + result[key] = [ + ( + getattr(item, "tensor_type", item) + if isinstance(item, quantization.QuantType) + else getattr(item, "value", item) + ) + for item in val + ] + return result + + +######################## SmoohQuant Config ############################### + + +@register_config(algo_name=constants.SMOOTH_QUANT, priority=constants.PRIORITY_SMOOTH_QUANT) +class SmoothQuantConfig(StaticQuantConfig): + """Smooth quant quantization config.""" + + supported_configs: List[_OperatorConfig] = [] + params_list: List[str] = [ + "weight_type", + "activation_type", + "per_channel", + "weight_sym", + "activation_sym", + "calibrate_method", + ] + model_params_list: List[str] = [ + # smooth parameters + "alpha", + "folding", + "auto_alpha_args", + "calib_iter", + "scales_per_op", + ] + name: str = constants.SMOOTH_QUANT + + def __init__( + self, + alpha: float = 0.5, + folding: bool = True, + op_types: List[str] = ["Gemm", "Conv", "MatMul", "FusedConv"], + calib_iter: int = 100, + scales_per_op: bool = True, + auto_alpha_args: dict = {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"}, + white_list: list = None, + **kwargs, + ): + """Init smooth quant config. 
+ + Args: + alpha (float, optional): alpha value to balance the quantization difficulty of activation and weight. + Defaults to 0.5. + folding (bool, optional): whether fold those foldable Mul which are inserted for smooth quant. + Defaults to True. + op_types (list, optional): the op type to be smooth quantized. + Defaults to ["Gemm", "Conv", "MatMul", "FusedConv"]. + calib_iter (int, optional): iteration num for calibration. Defaults to 100. + scales_per_op (bool, optional): True, each op will have an individual scale, mainlyfor accuracy. + False, ops with the same input will share a scale, mainly for performance. Defaults to True. + auto_alpha_args (dict, optional): settings for alpha tuning. + Defaults to {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"}. + kwargs (dict): kwargs in below link are supported except calibration_data_reader: + https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/quantize.py#L78 + """ + super().__init__(white_list=white_list, **kwargs) + self.alpha = alpha + self.folding = folding + self.op_types = op_types + self.calib_iter = calib_iter + self.scales_per_op = scales_per_op + self.auto_alpha_args = auto_alpha_args + + @classmethod + def register_supported_configs(cls) -> List[_OperatorConfig]: + supported_configs = [] + smooth_quant_config = SmoothQuantConfig() + operators = ["Gemm", "Conv", "MatMul", "FusedConv"] + supported_configs.append(_OperatorConfig(config=smooth_quant_config, operators=operators)) + cls.supported_configs = supported_configs + + @staticmethod + def get_model_info(model, white_list=["Gemm", "Conv", "MatMul", "FusedConv"]) -> list: + if not isinstance(model, onnx.ModelProto): + model = onnx.load(model, load_external_data=False) + + filter_result = [] + for node in model.graph.node: + if node.op_type in white_list: + pair = (node.name, node.op_type) + filter_result.append(pair) + logger.debug(f"Get model info: {filter_result}") + return filter_result + + @classmethod + def get_config_set_for_tuning( + cls, + ) -> Union[None, "SmoothQuantConfig", List["SmoothQuantConfig"]]: # pragma: no cover + return SmoothQuantConfig(alpha=np.arange(0.3, 0.7, 0.05)) + + +def get_default_sq_config() -> SmoothQuantConfig: + """Generate the default smooth quant config. + + Returns: + the default smooth quant config. + """ + return SmoothQuantConfig() + + +@register_config(algo_name=constants.DYNAMIC_QUANT, priority=constants.PRIORITY_DYNAMIC_QUANT) +class DynamicQuantConfig(BaseConfig, ort_quant.DynamicQuantConfig): + """This is a class for dynamic Quant Configuration. 
+ + Inherit from DynamicQuantConfig: + https://github.com/microsoft/onnxruntime/blob/v1.17.1/onnxruntime/python/tools/quantization/quantize.py#L206 + """ + + supported_configs: List[_OperatorConfig] = [] + params_list: List[str] = [ + "weight_type", + "activation_type", + "per_channel", + "weight_sym", + "activation_sym", + ] + model_params_list: List[str] = [ + "reduce_range", + "use_external_data_format", + "quant_last_matmul", + ] + name: str = constants.DYNAMIC_QUANT + + def __init__( + self, + weight_type: quantization.QuantType = quantization.QuantType.QInt8, + op_types_to_quantize: List[str] = None, + nodes_to_quantize: List[str] = None, + nodes_to_exclude: List[str] = None, + per_channel: bool = False, + reduce_range: bool = False, + use_external_data_format: bool = False, + extra_options: dict = None, + quant_last_matmul: bool = True, + execution_provider: str = None, + white_list: list = constants.DEFAULT_WHITE_LIST, + **kwargs, + ): + if execution_provider is None: + execution_provider = utility.auto_detect_ep() + if op_types_to_quantize is None: + op_types_to_quantize = constants.DYNAMIC_OP_LIST_MAP.get(execution_provider, []) + if not reduce_range and not utility.CpuInfo().vnni: + logger.warning( + "VNNI is not supported and reduce_range=False, reduce_range=True is recommended to avoid potential accuracy issue." + ) + ort_quant.DynamicQuantConfig.__init__( + self, + weight_type=weight_type, + op_types_to_quantize=op_types_to_quantize, + nodes_to_quantize=nodes_to_quantize, + nodes_to_exclude=nodes_to_exclude, + per_channel=per_channel, + reduce_range=reduce_range, + use_external_data_format=use_external_data_format, + extra_options=extra_options, + ) + BaseConfig.__init__(self, white_list=op_types_to_quantize) + self.execution_provider = execution_provider + self.quant_last_matmul = quant_last_matmul + self.activation_type = quantization.QuantType.QUInt8 + _extra_options = ExtraOptions(**self.extra_options) + self.weight_sym = _extra_options.WeightSymmetric + self.activation_sym = _extra_options.ActivationSymmetric + self.white_list = white_list + self._post_init() + + @staticmethod + def get_model_info(model, white_list=constants.DYNAMIC_CPU_OP_LIST) -> list: + if not isinstance(model, onnx.ModelProto): + model = onnx.load(model, load_external_data=False) + + filter_result = [] + for node in model.graph.node: + if node.op_type in white_list: + pair = (node.name, node.op_type) + filter_result.append(pair) + logger.debug(f"Get model info: {filter_result}") + return filter_result + + def get_model_params_dict(self): + result = dict() + for param in self.model_params_list: + result[param] = getattr(self, param) + return result + + def _post_init(self): + for op_name_or_type in self.op_types_to_quantize: + params = self.get_params_dict() + op_config = OperatorConfig(**params) + for valid_func in DYNAMIC_CHECK_FUNC_LIST: + op_config = valid_func(op_config, op_name_or_type, self.execution_provider) + self.set_local(op_name_or_type, op_config) + if isinstance(self.white_list, list) and len(self.white_list) > 0: + for op_name_or_type in self.white_list: + global_config = self.get_init_args() + tmp_config = self.__class__(**global_config, white_list=None) + self.set_local(op_name_or_type, tmp_config) + + def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: + if config_list is None: + config_list = [self] + for config in config_list: + # update model level setting + self._config_mapping.update(config.get_model_params_dict()) + + # update node 
level setting + op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() + last_matmul = None + for op_name, op_type in model_info: + if op_type == "MatMul": + last_matmul = op_name + if ( + isinstance(self.op_types_to_quantize, list) + and len(self.op_types_to_quantize) > 0 + and op_type not in self.op_types_to_quantize + ): + continue + if ( + isinstance(self.nodes_to_quantize, list) + and len(self.nodes_to_quantize) > 0 + and op_name not in self.nodes_to_quantize + ): + continue + if ( + isinstance(self.nodes_to_exclude, list) + and len(self.nodes_to_exclude) > 0 + and op_name in self.nodes_to_exclude + ): + continue + if op_type in op_type_config_dict: + self._config_mapping[op_name] = op_type_config_dict[op_type] + for op_name_pattern in op_name_config_dict: + if re.match(op_name_pattern, op_name): + self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] + + if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: + del self._config_mapping[last_matmul] + return self._config_mapping + + @classmethod + def get_config_set_for_tuning( + cls, + execution_provider=None, + op_types_to_quantize: List[str] = None, + nodes_to_exclude: List[str] = None, + reduce_range: bool = False, + use_external_data_format: bool = False, + quant_last_matmul: bool = True, + ) -> Union[None, "DynamicQuantConfig", List["DynamicQuantConfig"]]: # pragma: no cover + if execution_provider is None: + execution_provider = utility.auto_detect_ep() + if op_types_to_quantize is None: + op_types_to_quantize = constants.DYNAMIC_OP_LIST_MAP.get(execution_provider, []) + + op_type_candidate = [ + op_types_to_quantize, + list(set(op_types_to_quantize).difference({"EmbedLayerNormalization", "Gather", "LSTM"})), + list( + set(op_types_to_quantize).difference({"EmbedLayerNormalization", "Gather", "LSTM", "Conv", "FusedConv"}) + ), + list( + set(op_types_to_quantize).difference( + {"EmbedLayerNormalization", "Gather", "LSTM", "Conv", "FusedConv", "Attention"} + ) + ), + list( + set(op_types_to_quantize).difference( + {"EmbedLayerNormalization", "Gather", "LSTM", "Conv", "FusedConv", "MatMul"} + ) + ), + ] + + cfg_lst = [] + for item in op_type_candidate: + cfg_lst.append( + DynamicQuantConfig( + execution_provider=execution_provider, + op_types_to_quantize=item, + nodes_to_exclude=nodes_to_exclude, + reduce_range=reduce_range, + use_external_data_format=use_external_data_format, + quant_last_matmul=[True, False], + per_channel=[True, False], + ) + ) + return cfg_lst + + @classmethod + def register_supported_configs(cls) -> None: + supported_configs = [] + supported_configs.append( + _OperatorConfig( + config=OperatorConfig( + weight_type=onnx.TensorProto.UINT8, + weight_sym=False, + per_channel=False, + activation_type=onnx.TensorProto.UINT8, + activation_sym=False, + ), + operators=["FusedConv", "Conv", "EmbedLayerNormalization"], + valid_func_list=DYNAMIC_CHECK_FUNC_LIST, + ) + ) + supported_configs.append( + _OperatorConfig( + config=OperatorConfig( + weight_type=onnx.TensorProto.INT8, + weight_sym=True, + per_channel=[True, False], + activation_type=onnx.TensorProto.UINT8, + activation_sym=False, + ), + operators=["MatMul"], + valid_func_list=DYNAMIC_CHECK_FUNC_LIST, + ) + ) + supported_configs.append( + _OperatorConfig( + config=OperatorConfig( + weight_type=onnx.TensorProto.INT8, + weight_sym=True, + per_channel=False, + activation_type=onnx.TensorProto.UINT8, + activation_sym=False, + ), + operators=["Gather", "Attention", "LSTM"], + 
valid_func_list=DYNAMIC_CHECK_FUNC_LIST, + ) + ) + cls.supported_configs = supported_configs + + def to_dict(self): + result = {} + for key, val in self.__dict__.items(): + if key in ["_global_config", "_config_mapping"]: + continue + if key == "_local_config": + local_result = {} + for name, cfg in val.items(): + local_result[name] = cfg.to_dict() + result[key] = local_result + continue + if not isinstance(val, list): + result[key] = ( + getattr(val, "tensor_type", val) + if isinstance(val, quantization.QuantType) + else getattr(val, "value", val) + ) + else: + result[key] = [ + ( + getattr(item, "tensor_type", item) + if isinstance(item, quantization.QuantType) + else getattr(item, "value", item) + ) + for item in val + ] + return result + + +##################### NC Algo Configs End ################################### + +register_supported_configs() diff --git a/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py b/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py index 62a671fba..41c58a29f 100644 --- a/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py +++ b/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py @@ -15,7 +15,7 @@ from typing import List, Union # isort: skip import onnx -from onnxruntime.quantization import matmul_4bits_quantizer +import onnxruntime as ort from onnx_neural_compressor.quantization import matmul_nbits_quantizer @@ -33,8 +33,9 @@ def __init__( is_symmetric: bool = False, accuracy_level: int = 0, nodes_to_exclude=None, - algo_config: matmul_4bits_quantizer.WeightOnlyQuantConfig = None, + algo_config: matmul_nbits_quantizer.WeightOnlyQuantConfig = None, providers: List[str] = ["CPUExecutionProvider"], + optimization_level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, ): super().__init__( model=model, @@ -45,4 +46,5 @@ def __init__( algo_config=algo_config, n_bits=4, providers=providers, + optimization_level=optimization_level, ) diff --git a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py index 0d00bbbc5..ea77b18de 100644 --- a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py +++ b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py @@ -14,21 +14,41 @@ from typing import List, Union # isort: skip +import pathlib +import tempfile + import onnx -from onnxruntime.quantization import matmul_4bits_quantizer +import onnxruntime as ort -from onnx_neural_compressor import config, data_reader, logger, onnx_model, utility +from onnx_neural_compressor import data_reader, logger, onnx_model, utility from onnx_neural_compressor.quantization import algorithm_entry as algos +from onnx_neural_compressor.quantization import config + + +class WeightOnlyQuantConfig: + def __init__(self, algorithm): + """This is the Base class for Weight Only Quant Configuration. + Args: + algorithm: + weight only quantize algorithm name. 
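As a hedged usage sketch of the weight-only flow this base class anchors: the MatMulNBitsQuantizer class name is assumed from the module name, and "model.onnx" is a placeholder path.

    import onnxruntime as ort

    from onnx_neural_compressor.quantization import matmul_nbits_quantizer

    algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=False)
    quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
        model="model.onnx",  # placeholder: a str path or an onnx.ModelProto
        n_bits=4,
        block_size=32,
        algo_config=algo_config,
        optimization_level=ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
    )
    quant.process()
    qmodel = quant.model  # the quantized model
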
+ """ + self.algorithm = algorithm -class RTNWeightOnlyQuantConfig(matmul_4bits_quantizer.RTNWeightOnlyQuantConfig): + +class RTNWeightOnlyQuantConfig(WeightOnlyQuantConfig): def __init__(self, ratios=None, layer_wise_quant=False): - super().__init__(ratios=ratios) + super().__init__( + algorithm="RTN", + ) + if ratios is None: + ratios = {} + self.ratios = ratios self.layer_wise_quant = layer_wise_quant -class GPTQWeightOnlyQuantConfig(matmul_4bits_quantizer.GPTQWeightOnlyQuantConfig): +class GPTQWeightOnlyQuantConfig(WeightOnlyQuantConfig): def __init__( self, @@ -41,17 +61,18 @@ def __init__( layer_wise_quant=False, ): super().__init__( - calibration_data_reader=calibration_data_reader, - percdamp=percdamp, - block_size=block_size, - actorder=actorder, - mse=mse, - perchannel=perchannel, + algorithm="GPTQ", ) + self.calibration_data_reader = calibration_data_reader + self.percdamp = percdamp + self.block_size = block_size + self.actorder = actorder + self.mse = mse + self.perchannel = perchannel self.layer_wise_quant = layer_wise_quant -class AWQWeightOnlyQuantConfig(matmul_4bits_quantizer.WeightOnlyQuantConfig): +class AWQWeightOnlyQuantConfig(WeightOnlyQuantConfig): def __init__( self, @@ -81,15 +102,14 @@ def __init__( is_symmetric: bool = False, accuracy_level: int = 0, nodes_to_exclude: List[str] = None, - algo_config: matmul_4bits_quantizer.WeightOnlyQuantConfig = None, + algo_config: WeightOnlyQuantConfig = None, n_bits: int = 4, providers: List[str] = ["CPUExecutionProvider"], + optimization_level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, ): if nodes_to_exclude is None: nodes_to_exclude = [] - self.model_path = model if isinstance(model, str) else None self.model = model - self.model = onnx_model.ONNXModel(onnx.load(model)) if isinstance(model, str) else onnx_model.ONNXModel(model) self.block_size = block_size self.is_symmetric = is_symmetric self.accuracy_level = accuracy_level @@ -98,6 +118,7 @@ def __init__( self.n_bits = n_bits self.providers = providers self.algorithm = self.algo_config.algorithm + self.optimization_level = optimization_level assert self.algorithm in [ "RTN", "AWQ", @@ -106,7 +127,6 @@ def __init__( def _generate_nc_config(self): config_class = config.config_registry.get_cls_configs()[self.algorithm.lower()] - quant_kwargs = { "weight_bits": self.n_bits, "weight_group_size": self.block_size, @@ -124,7 +144,7 @@ def _generate_nc_config(self): quant_kwargs.update( { "percdamp": self.algo_config.percdamp, - "blocksize": self.algo_config.block_size, + "block_size": self.algo_config.block_size, "actorder": self.algo_config.actorder, "mse": self.algo_config.mse, "perchannel": self.algo_config.perchannel, @@ -148,9 +168,33 @@ def _generate_nc_config(self): def int4_quant_algo(self): qconfig = self._generate_nc_config() + model = self.model + opt_tmp_file = tempfile.TemporaryDirectory() + + # do graph optimization if not layer_wise_quant + if ( + not getattr(self.algo_config, "layer_wise_quant", False) + and self.optimization_level != ort.GraphOptimizationLevel.ORT_DISABLE_ALL + ): + if not isinstance(model, str): + onnx.save(model, pathlib.Path(opt_tmp_file.name).joinpath("tmp.onnx").as_posix()) + model = pathlib.Path(opt_tmp_file.name).joinpath("tmp.onnx").as_posix() + logger.info("Start graph optimization...") + sess_options = ort.SessionOptions() + sess_options.graph_optimization_level = self.optimization_level + sess_options.optimized_model_filepath = pathlib.Path(opt_tmp_file.name).joinpath("opt.onnx").as_posix() + 
sess_options.add_session_config_entry( + "session.optimized_model_external_initializers_file_name", "opt.onnx_data" + ) + sess_options.add_session_config_entry( + "session.optimized_model_external_initializers_min_size_in_bytes", "1024" + ) + session = ort.InferenceSession(model, sess_options) + model = sess_options.optimized_model_filepath + del session + logger.info("Graph optimization done.") logger.info(f"start to quantize model with {self.algorithm} algorithm...") - model = self.model_path or self.model if self.algorithm == "RTN": self.model = algos.rtn_quantize_entry(model, qconfig) elif self.algorithm == "GPTQ": @@ -158,6 +202,7 @@ def int4_quant_algo(self): elif self.algorithm == "AWQ": self.model = algos.awq_quantize_entry(model, qconfig, self.algo_config.calibration_data_reader) logger.info(f"complete quantization of model with {self.algorithm} algorithm.") + opt_tmp_file.cleanup() def process(self): self.int4_quant_algo() diff --git a/onnx_neural_compressor/quantization/quant_utils.py b/onnx_neural_compressor/quantization/quant_utils.py new file mode 100644 index 000000000..2d5518857 --- /dev/null +++ b/onnx_neural_compressor/quantization/quant_utils.py @@ -0,0 +1,47 @@ +# Copyright (c) 2023 MIT HAN Lab +# This source code is licensed under the MIT license +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import enum + +import onnx + + +class QuantType(enum.Enum): # pragma: no cover + """Represent QuantType value.""" + + QInt8 = 0 + QUInt8 = 1 + + @property + def tensor_type(self): + if self == QuantType.QInt8: + return onnx.TensorProto.INT8 + if self == QuantType.QUInt8: + return onnx.TensorProto.UINT8 + raise ValueError(f"Unexpected value qtype={self!r}.") + + +class QuantFormat(enum.Enum): + QOperator = 0 + QDQ = 1 + + +class CalibrationMethod(enum.Enum): + MinMax = 0 + Entropy = 1 + Percentile = 2 + Distribution = 3 diff --git a/onnx_neural_compressor/quantization/quantize.py b/onnx_neural_compressor/quantization/quantize.py index 7e388e3aa..9fb3dfd41 100644 --- a/onnx_neural_compressor/quantization/quantize.py +++ b/onnx_neural_compressor/quantization/quantize.py @@ -13,32 +13,51 @@ # limitations under the License. 
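For reference, the n-bits quantizer above and the quantize()/autotune entry points below rely on the same ONNX Runtime offline graph-optimization pattern. A standalone sketch of that pattern, with a placeholder model path and a hypothetical helper name:

    import pathlib
    import tempfile

    import onnxruntime as ort


    def pre_optimize(model_path: str, level=ort.GraphOptimizationLevel.ORT_ENABLE_BASIC) -> str:
        """Run ORT offline graph optimization and return the path of the optimized model."""
        tmp_dir = tempfile.mkdtemp(prefix="ort.opt.")
        sess_options = ort.SessionOptions()
        sess_options.graph_optimization_level = level
        sess_options.optimized_model_filepath = pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix()
        # keep large initializers in a side file so big models can still be serialized
        sess_options.add_session_config_entry("session.optimized_model_external_initializers_file_name", "opt.onnx_data")
        sess_options.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "1024")
        ort.InferenceSession(model_path, sess_options)  # creating the session writes the optimized model
        return sess_options.optimized_model_filepath
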
import pathlib +import tempfile from typing import Union import onnx +import onnxruntime as ort from onnxruntime.quantization.quantize import QuantConfig -from onnx_neural_compressor import config from onnx_neural_compressor.quantization import algorithm_entry as algos +from onnx_neural_compressor.quantization import config # ORT-like user-facing API def quantize( model_input: Union[str, pathlib.Path, onnx.ModelProto], model_output: Union[str, pathlib.Path], - quant_config: QuantConfig, + quant_config: config.BaseConfig, + optimization_level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, ): - if isinstance(quant_config, config.StaticQuantConfig): - if quant_config.extra_options.get("SmoothQuant", False): - nc_sq_config = config.generate_nc_sq_config(quant_config) - algos.smooth_quant_entry( - model_input, nc_sq_config, quant_config.calibration_data_reader, model_output=model_output + with tempfile.TemporaryDirectory(prefix="ort.opt.") as tmp_dir: + if optimization_level != ort.GraphOptimizationLevel.ORT_DISABLE_ALL: + sess_options = ort.SessionOptions() + sess_options.graph_optimization_level = optimization_level + sess_options.optimized_model_filepath = pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix() + sess_options.add_session_config_entry( + "session.optimized_model_external_initializers_file_name", "opt.onnx_data" ) + sess_options.add_session_config_entry( + "session.optimized_model_external_initializers_min_size_in_bytes", "1024" + ) + session = ort.InferenceSession(model_input, sess_options) + del session + model_input = sess_options.optimized_model_filepath + + if isinstance(quant_config, config.StaticQuantConfig): + if quant_config.extra_options.get("SmoothQuant", False): + algos.smooth_quant_entry( + model_input, quant_config, quant_config.calibration_data_reader, model_output=model_output + ) + else: + algos.static_quantize_entry( + model_input, quant_config, quant_config.calibration_data_reader, model_output=model_output + ) + elif isinstance(quant_config, config.DynamicQuantConfig): + algos.dynamic_quantize_entry(model_input, quant_config, model_output=model_output) else: - # call static_quant_entry - pass - elif isinstance(quant_config, config.DynamicQuantConfig): - # call dynamic_quant_entry - pass - else: - raise TypeError("Invalid quantization config type, it must be either StaticQuantConfig or DynamicQuantConfig.") + raise TypeError( + "Invalid quantization config type, it must be either StaticQuantConfig or DynamicQuantConfig." 
+ ) diff --git a/onnx_neural_compressor/quantization/tuning.py b/onnx_neural_compressor/quantization/tuning.py index a6743ad7a..5bf2d95d4 100644 --- a/onnx_neural_compressor/quantization/tuning.py +++ b/onnx_neural_compressor/quantization/tuning.py @@ -15,12 +15,17 @@ import copy import os import pathlib +import shutil import tempfile +import traceback import uuid import onnx +import onnxruntime as ort +from onnx import external_data_helper -from onnx_neural_compressor import config, data_reader, logger, utility +from onnx_neural_compressor import data_reader, logger, utility +from onnx_neural_compressor.quantization import config from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Sized, Tuple, Union # isort: skip @@ -99,7 +104,9 @@ def _set_eval_fn_registry(self, user_eval_fns: List[Dict]) -> None: { self.EVAL_FN: user_eval_fn_pair[self.EVAL_FN], self.WEIGHT: user_eval_fn_pair.get(self.WEIGHT, 1.0), - self.FN_NAME: user_eval_fn_pair.get(self.FN_NAME, user_eval_fn_pair[self.EVAL_FN].__name__), + self.FN_NAME: user_eval_fn_pair.get( + self.FN_NAME, getattr(user_eval_fn_pair[self.EVAL_FN], "__name__", "custom_func") + ), } for user_eval_fn_pair in user_eval_fns ] @@ -224,13 +231,29 @@ def __len__(self) -> int: class ConfigLoader: - def __init__(self, config_set: ConfigSet, sampler: Sampler = default_sampler) -> None: + def __init__( + self, config_set: ConfigSet, sampler: Sampler = default_sampler, skip_verified_config: bool = True + ) -> None: self.config_set = ConfigSet.from_fwk_configs(config_set) self._sampler = sampler(self.config_set) + self.skip_verified_config = skip_verified_config + self.verify_config_list = list() + + def is_verified_config(self, config): + for verified_config in self.verify_config_list: + if config == verified_config: + return True + return False def __iter__(self) -> Generator[config.BaseConfig, Any, None]: for index in self._sampler: - yield self.config_set[index] + new_config = self.config_set[index] + if self.skip_verified_config and self.is_verified_config(new_config): + logger.debug("Skip the verified config:") + logger.debug(new_config.to_dict()) + continue + self.verify_config_list.append(new_config) + yield new_config class TuningConfig: @@ -317,13 +340,13 @@ def set_baseline(self, baseline: float): def get_number_of_trials(self): return len(self.tuning_history) - def get_best_quant_config(self) -> config.BaseConfig: - assert self.get_number_of_trials() > 0, "No trial record in tuning monitor." - # Put the record with a higher score at the beginning - sorted_trials_records: List[_TrialRecord] = sorted( - self.tuning_history, key=lambda x: x.trial_result, reverse=True - ) - return sorted_trials_records[0].quant_config + def need_skip(self, config) -> bool: + """Check whether the expanded quant config is verified.""" + if len(self.tuning_history) > 0 and any([config == i.quant_config.config_mapping for i in self.tuning_history]): + logger.warning("Skip the verified config mapping.") + logger.debug(config) + return True + return False def need_stop(self) -> bool: """Check if need to stop tuning. Either accuracy goal is met, max trials is reached or timeout is reached. @@ -343,6 +366,12 @@ def need_stop(self) -> bool: # [-1] is the last element representing the latest trail record. 
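Putting the tuning pieces together, a hedged sketch of driving the autotune entry defined below; the TuningConfig keyword names are assumed here, and eval_fn receives a model path and must return a scalar metric:

    import onnxruntime as ort

    from onnx_neural_compressor.quantization import config, tuning


    def eval_fn(model_path: str) -> float:
        # placeholder metric; autotune always passes the candidate model as a file path
        return 1.0


    best_model = tuning.autotune(
        model_input="model.onnx",  # placeholder path
        tune_config=tuning.TuningConfig(
            config_set=config.DynamicQuantConfig.get_config_set_for_tuning(),  # assumed keyword
            tolerable_loss=0.01,  # assumed keyword, mirrored from the tolerable_loss hint in this file
        ),
        eval_fn=eval_fn,
        optimization_level=ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
    )
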
return reach_max_trials or meet_accuracy_goal + def print_config_diff(self, config): + if len(self.tuning_history) == 0: + logger.info("quant config: {}".format(config)) + else: + logger.info("quant config difference: {}".format(config.get_diff_dict(self.tuning_history[0].quant_config))) + class TuningLogger: """A unified logger for the tuning/quantization process. @@ -398,8 +427,6 @@ def _need_apply(quant_config: config.BaseConfig, algo_name): return quant_config.name == algo_name if hasattr(quant_config, "name") else False -# * only for internal usage now -@utility.log_quant_execution def _quantize( model_input: Union[pathlib.Path, str], quant_config: config.BaseConfig, @@ -424,7 +451,7 @@ def _quantize( assert isinstance( quant_config, config.BaseConfig ), f"Please pass a dict or config instance as the quantization configuration, but got {type(quant_config)}." - logger.info(f"Quantize model with config: \n {quant_config} \n") + logger.debug(f"Quantize model with config: \n {quant_config} \n") # select quantization algo according to config q_model = None @@ -441,6 +468,7 @@ def autotune( eval_fn: Callable, eval_args: Optional[Tuple[Any]] = None, calibration_data_reader: data_reader.CalibrationDataReader = None, + optimization_level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, ) -> Union[None, onnx.ModelProto]: """The main entry of auto-tune. @@ -455,63 +483,104 @@ def autotune( During evaluation, autotune will only pass model path as the input of function. eval_args (Optional[Tuple[Any]]): evaluate arguments. Positional arguments for `eval_fn`. - calibration_data_reader (data_reader.CalibrationDataReader): dataloader for calibration. + optimization_level (onnxruntime.GraphOptimizationLevel): graph optimization level. + Support ORT_DISABLE_ALL, ORT_ENABLE_ALL, ORT_ENABLE_BASIC, ORT_ENABLE_EXTENDED. Default is ORT_ENABLE_BASIC. 
+ Details: https://onnxruntime.ai/docs/performance/model-optimizations/graph-optimizations.html#onlineoffline-mode """ best_quant_model = None eval_func_wrapper = EvaluationFuncWrapper(eval_fn, eval_args) config_loader, tuning_logger, tuning_monitor = init_tuning(tuning_config=tune_config) + tmp_folder = tempfile.TemporaryDirectory() + pathlib.Path(tmp_folder.name).joinpath("./eval").mkdir() + if optimization_level != ort.GraphOptimizationLevel.ORT_DISABLE_ALL: + sess_options = ort.SessionOptions() + sess_options.graph_optimization_level = optimization_level + sess_options.optimized_model_filepath = pathlib.Path(tmp_folder.name).joinpath("model.onnx").as_posix() + sess_options.add_session_config_entry( + "session.optimized_model_external_initializers_file_name", "model.onnx_data" + ) + sess_options.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "1024") + session = ort.InferenceSession(model_input, sess_options) + + # copy config.json to tmp dir for evaluation, LLMs evaluation may need it + if isinstance(model_input, str) and os.path.exists( + pathlib.Path(model_input).parent.joinpath("config.json").as_posix() + ): + shutil.copyfile( + pathlib.Path(model_input).parent.joinpath("config.json").as_posix(), + pathlib.Path(tmp_folder.name).joinpath("config.json").as_posix(), + ) + + model_input = sess_options.optimized_model_filepath + del session + try: baseline: float = eval_func_wrapper.evaluate(model_input) except Exception as e: - print(e) if "'str' object has no attribute 'SerializeToString'" in str(e): logger.warning("Please refine your eval_fn to accept model path (str) as input.") + if "Unable to load from type ''" in str(e): + logger.warning("Please pass model path to autotune API rather than onnx.ModelProto.") + print(traceback.format_exc()) exit(0) + tuning_monitor.set_baseline(baseline) tuning_logger.tuning_start() for trial_index, quant_config in enumerate(config_loader): + # check whether config_mapping is verified + model_info = quant_config.__class__.get_model_info(model=model_input) + config_mapping = quant_config.to_config_mapping(model_info=model_info) + if tuning_monitor.need_skip(config_mapping): + continue + if calibration_data_reader is not None: calibration_data_reader.rewind() + tuning_logger.trial_start(trial_index=trial_index) tuning_logger.quantization_start() - logger.debug("quant config: {}".format(quant_config)) + tuning_monitor.print_config_diff(quant_config) q_model = _quantize(model_input, quant_config=quant_config, calibration_data_reader=calibration_data_reader) tuning_logger.quantization_end() tuning_logger.evaluation_start() - with tempfile.TemporaryDirectory(prefix="ort.quant.") as tmp_dir: - # evaluate API requires str input - onnx.save_model( - q_model, - pathlib.Path(tmp_dir).joinpath(pathlib.Path(model_input).name).as_posix(), - save_as_external_data=True, - all_tensors_to_one_file=True, - location=pathlib.Path(model_input).with_suffix(pathlib.Path(model_input).suffix + "_data").name, - size_threshold=1024, - convert_attribute=False, - ) - # copy config.json to tmp dir for evaluation, LLMs evaluation may need it - if isinstance(model_input, str) and os.path.exists( - pathlib.Path(model_input).parent.joinpath("config.json").as_posix() - ): - import shutil - - shutil.copyfile( - pathlib.Path(model_input).parent.joinpath("config.json").as_posix(), - pathlib.Path(tmp_dir).joinpath("config.json").as_posix(), - ) - eval_result: float = eval_func_wrapper.evaluate( - 
pathlib.Path(tmp_dir).joinpath(pathlib.Path(model_input).name).as_posix() + # evaluate API requires str input + onnx.save_model( + q_model, + pathlib.Path(tmp_folder.name).joinpath("./eval/model.onnx").as_posix(), + save_as_external_data=True, + all_tensors_to_one_file=True, + size_threshold=1024, + convert_attribute=False, + ) + # copy config.json to tmp dir for evaluation, LLMs evaluation may need it + if isinstance(model_input, str) and os.path.exists( + pathlib.Path(model_input).parent.joinpath("config.json").as_posix() + ): + shutil.copyfile( + pathlib.Path(model_input).parent.joinpath("config.json").as_posix(), + pathlib.Path(tmp_folder.name).joinpath("./eval/config.json").as_posix(), ) + eval_result: float = eval_func_wrapper.evaluate( + pathlib.Path(tmp_folder.name).joinpath("./eval/model.onnx").as_posix() + ) tuning_logger.evaluation_end() logger.info("Evaluation result: %.4f", eval_result) tuning_monitor.add_trial_result(trial_index, eval_result, quant_config) tuning_logger.trial_end(trial_index) if tuning_monitor.need_stop(): - best_quant_config: config.BaseConfig = tuning_monitor.get_best_quant_config() - best_quant_model = _quantize( - model_input, quant_config=best_quant_config, calibration_data_reader=calibration_data_reader + external_data_helper.load_external_data_for_model( + q_model, pathlib.Path(tmp_folder.name).joinpath("./eval").as_posix() ) + best_quant_model = q_model break + tuning_logger.tuning_end() + if best_quant_model is None: + logger.info( + "Don't find the quantized model which meets accuracy requirement. " + "Please try other configs or adjust tolerable_loss." + ) + exit(0) + + tmp_folder.cleanup() return best_quant_model diff --git a/onnx_neural_compressor/utility.py b/onnx_neural_compressor/utility.py index cc36b6e8a..8bea213b5 100644 --- a/onnx_neural_compressor/utility.py +++ b/onnx_neural_compressor/utility.py @@ -22,8 +22,9 @@ import cpuinfo import numpy as np import onnx +import onnxruntime as ort +import prettytable as pt import psutil -from onnxruntime.quantization import onnx_model from onnx_neural_compressor import constants, logger @@ -75,35 +76,20 @@ class Options: This class is used for configuring global variables. The global variable options is created with this class. If you want to change global variables, you should use functions from onnx_neural_compressor.utility.py: set_random_seed(seed: int) - set_workspace(workspace: str) - set_resume_from(resume_from: str) Args: random_seed(int): Random seed used in neural compressor. Default value is 1978. - workspace(str): The directory where intermediate files and tuning history file are stored. - Default value is: - "./nc_workspace/{}/".format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")). - resume_from(str): The directory you want to resume tuning history file from. - The tuning history was automatically saved in the workspace directory - during the last tune process. - Default value is None. 
Example:: from onnx_neural_compressor import set_random_seed - from onnx_neural_compressor import set_workspace - from onnx_neural_compressor import set_resume_from set_random_seed(2022) - set_workspace("workspace_path") - set_resume_from("workspace_path") """ - def __init__(self, random_seed=1978, workspace=constants.DEFAULT_WORKSPACE, resume_from=None): + def __init__(self, random_seed=1978): """Init an Option object.""" self.random_seed = random_seed - self.workspace = workspace - self.resume_from = resume_from @property def random_seed(self): @@ -116,71 +102,10 @@ def random_seed(self, random_seed): if check_value("random_seed", random_seed, int): self._random_seed = random_seed - @property - def workspace(self): - """Get workspace.""" - return self._workspace - - @workspace.setter - def workspace(self, workspace): - """Set workspace.""" - if check_value("workspace", workspace, str): - self._workspace = workspace - - @property - def resume_from(self): - """Get resume_from.""" - return self._resume_from - - @resume_from.setter - def resume_from(self, resume_from): - """Set resume_from.""" - if resume_from is None or check_value("resume_from", resume_from, str): - self._resume_from = resume_from - options = Options() -class TuningLogger: - """A unified logger for the tuning/quantization process. - - It assists validation teams in retrieving logs. - """ - - @classmethod - def tuning_start(cls) -> None: - logger.info("Tuning started.") - - @classmethod - def trial_start(cls, trial_index: int = None) -> None: - logger.info("%d-trail started.", trial_index) - - @classmethod - def quantization_start(cls, stacklevel=2) -> None: - logger.info("Quantization started.", stacklevel=stacklevel) - - @classmethod - def quantization_end(cls, stacklevel=2) -> None: - logger.info("Quantization end.", stacklevel=stacklevel) - - @classmethod - def evaluation_start(cls) -> None: - logger.info("Evaluation started.") - - @classmethod - def evaluation_end(cls) -> None: - logger.info("Evaluation end.") - - @classmethod - def trial_end(cls, trial_index: int = None) -> None: - logger.info("%d-trail end.", trial_index) - - @classmethod - def tuning_end(cls) -> None: - logger.info("Tuning completed.") - - def singleton(cls): """Singleton decorator.""" @@ -195,6 +120,48 @@ def _singleton(*args, **kw): return _singleton +class Statistics: + """The statistics printer.""" + + def __init__(self, data, header, field_names, output_handle=logger.info): + """Init a Statistics object. 
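A small usage sketch of this statistics printer; the table contents are invented purely for illustration:

    from onnx_neural_compressor import utility

    utility.Statistics(
        data=[["MatMul", 20, 18], ["Conv", 6, 6]],
        header="Mixed Precision Statistics",
        field_names=["Op Type", "Total", "INT8"],
    ).print_stat()
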
+ + Args: + data: The statistics data + header: The table header + field_names: The field names + output_handle: The output logging method + """ + self.field_names = field_names + self.header = header + self.data = data + self.output_handle = output_handle + self.tb = pt.PrettyTable(min_table_width=40) + + def print_stat(self): + """Print the statistics.""" + valid_field_names = [] + for index, value in enumerate(self.field_names): + if index < 2: + valid_field_names.append(value) + continue + + if any(i[index] for i in self.data): + valid_field_names.append(value) + self.tb.field_names = valid_field_names + for i in self.data: + tmp_data = [] + for index, value in enumerate(i): + if self.field_names[index] in valid_field_names: + tmp_data.append(value) + if any(tmp_data[1:]): + self.tb.add_row(tmp_data) + lines = self.tb.get_string().split("\n") + self.output_handle("|" + self.header.center(len(lines[0]) - 2, "*") + "|") + for i in lines: + self.output_handle(i) + + class LazyImport(object): """Lazy import python module till use.""" @@ -296,96 +263,11 @@ def get_number_of_sockets(self) -> int: return 0 -def dump_elapsed_time(customized_msg=""): - """Get the elapsed time for decorated functions. - - Args: - customized_msg (string, optional): The parameter passed to decorator. Defaults to None. - """ - - def f(func): - - def fi(*args, **kwargs): - start = time.time() - res = func(*args, **kwargs) - end = time.time() - logger.info( - "%s elapsed time: %s ms" - % (customized_msg if customized_msg else func.__qualname__, round((end - start) * 1000, 2)) - ) - return res - - return fi - - return f - - def set_random_seed(seed: int): """Set the random seed in config.""" options.random_seed = seed -def set_workspace(workspace: str): - """Set the workspace in config.""" - options.workspace = workspace - - -def set_resume_from(resume_from: str): - """Set the resume_from in config.""" - options.resume_from = resume_from - - -def log_quant_execution(func): - default_tuning_logger = TuningLogger() - - def wrapper(*args, **kwargs): - default_tuning_logger.quantization_start(stacklevel=4) - - # Call the original function - result = func(*args, **kwargs) - - default_tuning_logger.quantization_end(stacklevel=4) - return result - - return wrapper - - -dtype_mapping = { - "fp32": 1, - "float32": 1, - "uint8": 2, - "int8": 3, - "uint16": 4, - "int16": 5, - "int32": 6, - "int64": 7, - "string": 8, - "bool": 9, - "fp16": 10, - "float16": 10, - "double": 11, - "uint32": 12, - "uint64": 13, - "complex64": 14, - "complex128": 15, - "bf16": 16, - "bfloat16": 16, -} - - -def find_by_name(name, item_list): - """Helper function to find item by name in a list.""" - items = [] - for item in item_list: - assert hasattr(item, "name"), "{} should have a 'name' attribute defined".format(item) # pragma: no cover - if item.name == name: - items.append(item) - if len(items) > 0: - return items[0] - else: - return None - - def simple_progress_bar(total, i): """Progress bar for cases where tqdm can't be used.""" progress = i / total @@ -419,157 +301,26 @@ def decorator(algo_func): return decorator -def get_model_info( - model: Union[onnx.ModelProto, pathlib.Path, str], white_op_type_list: List[Callable] -) -> List[Tuple[str, Callable]]: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model) - filter_result = [] - filter_result_set = set() - for node in model.graph.node: - if node.op_type in white_op_type_list: - pair = (node.name, node.op_type) - if pair not in filter_result_set: - filter_result_set.add(pair) - 
filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - -def is_B_transposed(node): - """Whether inuput B is transposed.""" - transB = [attr for attr in node.attribute if attr.name == "transB"] - if len(transB): - return 0 < onnx.helper.get_attribute_value(transB[0]) - return False - - -def get_qrange_for_qType(qType, reduce_range=False): - """Helper function to get the quantization range for a type. - - Args: - qType (int): data type - reduce_range (bool, optional): use 7 bit or not. Defaults to False. - """ - if qType == onnx.onnx_pb.TensorProto.UINT8: - return 127 if reduce_range else 255 - elif qType == onnx.onnx_pb.TensorProto.INT8: - # [-64, 64] for reduce_range, and [-127, 127] full_range. - return 128 if reduce_range else 254 +def auto_detect_ep(): + eps = ort.get_available_providers() + if "DnnlExecutionProvider" in eps: + return "DnnlExecutionProvider" + elif "DmlExecutionProvider" in eps: + return "DnnlExecutionProvider" + elif "CUDAExecutionProvider" in eps: + return "CUDAExecutionProvider" else: - raise ValueError("unsupported quantization data type") - + return "CPUExecutionProvider" -def _quantize_data_with_scale_zero(data, qType, scheme, scale, zero_point): - """Quantize data with scale and zero point. - - To pack weights, we compute a linear transformation - - when data type == uint8 mode, from [rmin, rmax] -> [0, 2^{b-1}] and - - when data type == int8, from [-m , m] -> [-(2^{b-1}-1), 2^{b-1}-1] where - m = max(abs(rmin), abs(rmax)) - - Args: - data (np.array): data to quantize - qType (int): data type to quantize to. Supported types UINT8 and INT8 - scheme (string): sym or asym quantization. - scale (float): computed scale of quantized data - zero_point (uint8 or int8): computed zero point of quantized data - """ - data = np.asarray(data) - if qType == onnx.onnx_pb.TensorProto.INT8 and scheme == "sym": - # signed byte type - quantized_data = (data.astype(np.float32) / scale).round().astype("b") - elif qType == onnx.onnx_pb.TensorProto.UINT8 and scheme == "asym": - quantized_data = ((data.astype(np.float32) / scale).round() + zero_point).astype("B") - else: - raise ValueError("Unexpected combination of data type {} and scheme {}.".format(qType, scheme)) - return quantized_data - - -def _calculate_scale_zp(rmin, rmax, quantize_range, qType, scheme): - """Calculate scale and zero point.""" - if isinstance(rmax, np.ndarray): - if scheme == "sym": - max_range = np.maximum(abs(rmin), abs(rmax)) - scale = np.ones(rmax.shape, dtype="float32") - scale[max_range > 0] = np.array( - [float(i) / quantize_range for i in (max_range[max_range > 0] * 2.0).flatten().tolist()], - dtype="float32", - ) - else: - scale = np.ones(rmax.shape, dtype="float32") - scale[rmin != rmax] = np.array( - [float(i) / quantize_range for i in (rmax - rmin)[rmin != rmax].flatten().tolist()], dtype="float32" - ) - - if scheme == "sym" and qType == onnx.onnx_pb.TensorProto.INT8: - zero_point = np.zeros(scale.shape, dtype="int8") if isinstance(scale, np.ndarray) else 0 - elif isinstance(scale, np.ndarray) and (scale == 1).all(): - zero_point = ( - np.zeros(scale.shape, dtype="int8") - if qType == onnx.onnx_pb.TensorProto.INT8 - else np.zeros(scale.shape, dtype="uint8") - ) - elif qType == onnx.onnx_pb.TensorProto.UINT8: - zero_point = np.maximum(0, np.minimum(255, ((0 - float(rmin)) / scale).round()).round()).astype("uint8") - else: - zero_point = ( - (-64 - rmin) / float(scale) if quantize_range == 128 else (-127 - rmin) / float(scale) - ).round() +def 
trt_env_setup(model): + """Set environment variable for Tensorrt Execution Provider.""" + is_int8 = False + for node in model.graph.node: + if node.op_type in ["QuantizeLinear", "DequantizeLinear"]: + is_int8 = True + break + if is_int8: + os.environ["ORT_TENSORRT_INT8_ENABLE"] = "1" else: - if scheme == "sym": - max_range = max(abs(rmin), abs(rmax)) - scale = (float(max_range) * 2) / quantize_range if max_range > 0 else 1 - else: - scale = (float(rmax) - float(rmin)) / quantize_range if rmin != rmax else 1 - - if scale == 1 or (scheme == "sym" and qType == onnx.onnx_pb.TensorProto.INT8): - zero_point = 0 - elif qType == onnx.onnx_pb.TensorProto.UINT8: - zero_point = round((0 - float(rmin)) / scale) - zero_point = np.uint8(round(max(0, min(255, zero_point)))) - else: - zero_point = ( - round((-64 - float(rmin)) / scale) if quantize_range == 128 else round((-127 - float(rmin)) / scale) - ) - return scale, zero_point - - -def quantize_data(data, quantize_range, qType, scheme): - """Quantize data. - - To pack weights, we compute a linear transformation - - when data type == uint8 mode, from [rmin, rmax] -> [0, 2^{b-1}] and - - when data type == int8, from [-m , m] -> [-(2^{b-1}-1), 2^{b-1}-1] where - m = max(abs(rmin), abs(rmax)) - and add necessary intermediate nodes to transform quantized weight to full weight - using the equation r = S(q-z), where - r: real original value - q: quantized value - S: scale - z: zero point - - Args: - data (array): data to quantize - quantize_range (list): list of data to weight pack. - qType (int): data type to quantize to. Supported types UINT8 and INT8 - scheme (string): sym or asym quantization. - """ - rmin = min(min(data), 0) - rmax = max(max(data), 0) - - scale, zero_point = _calculate_scale_zp(rmin, rmax, quantize_range, qType, scheme) - quantized_data = _quantize_data_with_scale_zero(data, qType, scheme, scale, zero_point) - return rmin, rmax, zero_point, scale, quantized_data - - -def check_model_with_infer_shapes(model): - """Check if the model has been shape inferred.""" - if isinstance(model, (pathlib.Path, str)): - model = onnx.load(model, load_external_data=False) - elif isinstance(model, onnx_model.ONNXModel): - model = model.model - if len(model.graph.value_info) > 0: - return True - return False + os.environ["ORT_TENSORRT_INT8_ENABLE"] = "0" diff --git a/onnx_neural_compressor/version.py b/onnx_neural_compressor/version.py index aa0978f16..08d071fc2 100644 --- a/onnx_neural_compressor/version.py +++ b/onnx_neural_compressor/version.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -# # Copyright (c) 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/requirements.txt b/requirements.txt index d02ba0d77..7e4911f78 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,5 @@ psutil py-cpuinfo pydantic transformers +prettytable +scipy diff --git a/test/quantization/layer_wise/test_layer_wise.py b/test/quantization/layer_wise/test_layer_wise.py index af0bca3e4..7988cd3f6 100644 --- a/test/quantization/layer_wise/test_layer_wise.py +++ b/test/quantization/layer_wise/test_layer_wise.py @@ -10,9 +10,9 @@ import transformers from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, data_reader, logger +from onnx_neural_compressor import data_reader, logger from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import matmul_4bits_quantizer +from onnx_neural_compressor.quantization import config, 
matmul_4bits_quantizer def find_onnx_file(folder_path): @@ -134,6 +134,7 @@ def test_rtn_layer_wise_with_ort_like_api(self): quant = matmul_4bits_quantizer.MatMul4BitsQuantizer( copy.deepcopy(self.llama), algo_config=algo_config, + optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL, ) quant.process() qmodel = quant.model @@ -145,6 +146,7 @@ def test_rtn_layer_wise_with_ort_like_api(self): quant = matmul_4bits_quantizer.MatMul4BitsQuantizer( copy.deepcopy(self.llama), algo_config=algo_config, + optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL, ) quant.process() qmodel_lwq = quant.model @@ -183,6 +185,7 @@ def test_gptq_layer_wise_with_ort_like_api(self): quant = matmul_4bits_quantizer.MatMul4BitsQuantizer( copy.deepcopy(self.llama), algo_config=algo_config, + optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL, ) quant.process() qmodel = quant.model @@ -196,6 +199,7 @@ def test_gptq_layer_wise_with_ort_like_api(self): quant = matmul_4bits_quantizer.MatMul4BitsQuantizer( copy.deepcopy(self.llama), algo_config=algo_config, + optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL, ) quant.process() qmodel_lwq = quant.model diff --git a/test/quantization/post_training_quant/test_calibrate.py b/test/quantization/post_training_quant/test_calibrate.py new file mode 100644 index 000000000..a02880d4a --- /dev/null +++ b/test/quantization/post_training_quant/test_calibrate.py @@ -0,0 +1,588 @@ +import os +import shutil +import sys +import unittest + +import numpy as np +import onnx + +from onnx_neural_compressor import data_reader +from onnx_neural_compressor.algorithms.post_training_quant import calibrate, calibrator + + +def generate_input_initializer(tensor_shape, tensor_dtype, input_name): + """Helper function to generate initializers for test inputs.""" + tensor = np.random.ranf(tensor_shape).astype(tensor_dtype) + init = onnx.numpy_helper.from_array(tensor, input_name) + return init + + +class DataReader(data_reader.CalibrationDataReader): + + def __init__(self): + self.data_list = [] + self.data_list.append( + { + "input0": np.array([[[[0.45, 0.60, 0.75]], [[0.25, 0.50, 0.75]], [[0.90, 0.70, 0.50]]]]).astype( + np.float32 + ) + } + ) + self.data_list.append( + { + "input0": np.array([[[[0.62, 0.94, 0.38]], [[0.70, 0.13, 0.07]], [[0.89, 0.75, 0.84]]]]).astype( + np.float32 + ) + } + ) + self.data_list.append( + { + "input0": np.array([[[[0.64, 0.24, 0.97]], [[0.82, 0.58, 0.27]], [[0.019, 0.34, 0.02]]]]).astype( + np.float32 + ) + } + ) + self.enum_data = None + + def get_next(self): + if self.enum_data is None: + self.enum_data = iter(self.data_list) + return next(self.enum_data, None) + + def rewind(self): + self.enum_data = None + + +class DataReader2(data_reader.CalibrationDataReader): + + def __init__(self): + self.data_list = [] + self.data_list.append({"A": np.random.random([1, 1, 5, 5]).astype(np.float32)}) + self.data_list.append({"A": np.random.random([1, 1, 5, 5]).astype(np.float32)}) + self.data_list.append({"A": np.random.random([1, 1, 5, 5]).astype(np.float32)}) + self.enum_data = None + + def get_next(self): + if self.enum_data is None: + self.enum_data = iter(self.data_list) + return next(self.enum_data, None) + + def rewind(self): + self.enum_data = None + + +def create_cv_session(): + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + b_value = np.random.randn(1, 1, 3, 3).astype(np.float32) + B_init = 
onnx.helper.make_tensor("B", onnx.TensorProto.FLOAT, [1, 1, 3, 3], b_value.reshape(9).tolist()) + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + conv_node = onnx.helper.make_node("Conv", ["A", "B"], ["C"], name="conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1]) + relu_node = onnx.helper.make_node("Relu", ["C"], ["D"], name="relu") + graph = onnx.helper.make_graph([conv_node, relu_node], "test_graph_1", [A], [D], [B_init]) + model = onnx.helper.make_model(graph, **{"opset_imports": [onnx.helper.make_opsetid("", 13)]}) + dataloader = DataReader2() + return model, dataloader + + +class TestCalibrate(unittest.TestCase): + work_space = "./onnxrt_calib_test" + + @classmethod + def setUpClass(cls): + os.makedirs(cls.work_space) + cls.cv_session = create_cv_session() + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.work_space, ignore_errors=True) + + def test_dump_calibration(self): + model, dataloader = self.cv_session + augment = calibrate.ONNXRTAugment(model, dataloader, ["Conv", "Relu"], iterations=[0]) + calib_params = augment.dump_calibration({}) + self.assertTrue("A" in calib_params and "B" in calib_params and "D" in calib_params and "C" in calib_params) + + def test_augment_graph(self): + """TEST_CONFIG_1.""" + + # Conv + # | + # Clip + # | + # MatMul + + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + E = onnx.helper.make_tensor_value_info("E", onnx.TensorProto.FLOAT, [1, 1, 5, 1]) + F = onnx.helper.make_tensor_value_info("F", onnx.TensorProto.FLOAT, [1, 1, 5, 1]) + conv_node = onnx.helper.make_node( + "Conv", ["A", "B"], ["C"], name="Conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1] + ) + clip_node = onnx.helper.make_node("Clip", ["C"], ["D"], name="Clip") + matmul_node = onnx.helper.make_node("MatMul", ["D", "E"], ["F"], name="MatMul") + graph = onnx.helper.make_graph([conv_node, clip_node, matmul_node], "test_graph_1", [A, B, E], [F]) + model = onnx.helper.make_model(graph) + + # Augmenting graph + data_reader = None + augment = calibrate.ONNXRTAugment(model, data_reader, ["Conv", "MatMul"]) + augment.augment_graph() + augmented_model = augment.augmented_model + + # Checking if output exists + augmented_model_node_names = [node.name for node in augmented_model.graph.node] + augmented_model_outputs = [output.name for output in augmented_model.graph.output] + added_node_names = ["Conv", "Clip", "MatMul"] + added_outputs = ["A", "B", "C", "D", "E", "F"] + # Original 3 nodes (exclude graph input/output) + self.assertEqual(len(augmented_model_node_names), 3) + # Original 1 graph output + 5 intermediate outputs + self.assertEqual(len(augmented_model_outputs), 6) + for name in added_node_names: + self.assertTrue(name in augmented_model_node_names) + for output in added_outputs: + self.assertTrue(output in augmented_model_outputs) + + print("Finished TEST_CONFIG_1") + """TEST_CONFIG_2.""" + + # Conv + # | + # Conv + + G = onnx.helper.make_tensor_value_info("G", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + H = onnx.helper.make_tensor_value_info("H", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + J = onnx.helper.make_tensor_value_info("J", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + K = onnx.helper.make_tensor_value_info("K", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + conv_node_1 = onnx.helper.make_node( + "Conv", ["G", "H"], ["I"], name="Conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1] + ) + conv_node_2 = onnx.helper.make_node( + "Conv", ["I", "J"], 
["K"], name="Conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1] + ) + graph = onnx.helper.make_graph([conv_node_1, conv_node_2], "test_graph_2", [G, H, J], [K]) + model = onnx.helper.make_model(graph) + + # Augmenting graph + data_reader = None + augment = calibrate.ONNXRTAugment( + model, + data_reader, + ["Conv", "MatMul"], + ) + augment.augment_graph() + augmented_model = augment.augmented_model + + augmented_model_node_names = [node.name for node in augmented_model.graph.node] + augmented_model_outputs = [output.name for output in augmented_model.graph.output] + added_node_names = ["Conv", "Conv"] + added_outputs = ["I", "J", "H", "G", "K"] + # Original 2 nodes + self.assertEqual(len(augmented_model_node_names), 2) + # Original 1 graph output + 4 intermediate outputs + self.assertEqual(len(augmented_model_outputs), 5) + for name in added_node_names: + self.assertTrue(name in augmented_model_node_names) + for output in added_outputs: + self.assertTrue(output in augmented_model_outputs) + + print("Finished TEST_CONFIG_2") + """TEST_CONFIG_3.""" + + # Relu + # | + # Conv \ + # | | + # Clip | + # | / + # MatMul + + L = onnx.helper.make_tensor_value_info("L", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + N = onnx.helper.make_tensor_value_info("N", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + Q = onnx.helper.make_tensor_value_info("Q", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + relu_node = onnx.helper.make_node("Relu", ["L"], ["M"], name="Relu") + conv_node = onnx.helper.make_node( + "Conv", ["M", "N"], ["O"], name="Conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1] + ) + clip_node = onnx.helper.make_node("Clip", ["O"], ["P"], name="Clip") + matmul_node = onnx.helper.make_node("MatMul", ["P", "M"], ["Q"], name="MatMul") + graph = onnx.helper.make_graph([relu_node, conv_node, clip_node, matmul_node], "test_graph_3", [L, N], [Q]) + model = onnx.helper.make_model(graph) + + # Augmenting graph + data_reader = None + augment = calibrate.ONNXRTAugment(model, data_reader, ["Conv", "MatMul"]) + augment.augment_graph() + augmented_model = augment.augmented_model + + augmented_model_node_names = [node.name for node in augmented_model.graph.node] + augmented_model_outputs = [output.name for output in augmented_model.graph.output] + added_node_names = ["Relu", "Conv", "Clip", "MatMul"] + added_outputs = ["P", "M", "N", "O", "Q"] + # Original 4 nodes + self.assertEqual(len(augmented_model_node_names), 4) + # Original 1 graph output + 4 intermediate outputs + self.assertEqual(len(augmented_model_outputs), 5) + for name in added_node_names: + self.assertTrue(name in augmented_model_node_names) + for output in added_outputs: + self.assertTrue(output in augmented_model_outputs) + + print("Finished TEST_CONFIG_3") + """TEST_CONFIG_4.""" + + # Attention + # | + # MatMul + + Attention_weight = onnx.helper.make_tensor_value_info("Attention_weight", onnx.TensorProto.FLOAT, [13, 7]) + Attention_bias = onnx.helper.make_tensor_value_info("Attention_bias", onnx.TensorProto.FLOAT, [13, 7]) + Attention_mask = onnx.helper.make_tensor_value_info("Attention_mask", onnx.TensorProto.INT32, [13, 7]) + S = onnx.helper.make_tensor_value_info("S", onnx.TensorProto.FLOAT, [13, 7]) + T = onnx.helper.make_tensor_value_info("T", onnx.TensorProto.FLOAT, [13, 7]) + attention_node = onnx.helper.make_node( + "Attention", ["Attention_weight", "Attention_bias", "Attention_mask"], ["R"], name="Attention" + ) + matmul_node = onnx.helper.make_node("MatMul", ["R", "S"], ["T"], name="MatMul") + graph = onnx.helper.make_graph( + [attention_node, matmul_node], 
"test_graph_4", [Attention_weight, Attention_bias, Attention_mask, S], [T] + ) + model = onnx.helper.make_model(graph) + + # Augmenting graph + data_reader = None + augment = calibrate.ONNXRTAugment(model, data_reader, ["Conv", "MatMul", "Attention"]) + augment.augment_graph() + augmented_model = augment.augmented_model + + augmented_model_node_names = [node.name for node in augmented_model.graph.node] + augmented_model_outputs = [output.name for output in augmented_model.graph.output] + added_node_names = ["Attention", "MatMul"] + added_outputs = ["R", "Attention_mask", "S", "T", "Attention_bias", "Attention_weight"] + # Original 2 nodes + self.assertEqual(len(augmented_model_node_names), 2) + # Original 1 graph output + 5 intermediate outputs + self.assertEqual(len(augmented_model_outputs), 6) + for name in added_node_names: + self.assertTrue(name in augmented_model_node_names) + for output in added_outputs: + self.assertTrue(output in augmented_model_outputs) + + print("Finished TEST_CONFIG_4") + + # QAttention + # | + # QuantizeLinear + + Attention_input = onnx.helper.make_tensor_value_info("input_quantized", onnx.TensorProto.INT8, [7, 13]) + Attention_weight = onnx.helper.make_tensor_value_info("weight_quantized", onnx.TensorProto.INT8, [13, 7]) + weight_quantized = generate_input_initializer([13, 7], np.int8, "weight_quantized") + Attention_bias = onnx.helper.make_tensor_value_info("bias", onnx.TensorProto.FLOAT, [13, 7]) + bias = generate_input_initializer([13, 7], np.float32, "bias") + Input_scale = onnx.helper.make_tensor_value_info("input_scale", onnx.TensorProto.FLOAT, [1]) + input_scale = generate_input_initializer([1], np.float32, "input_scale") + Weight_scale = onnx.helper.make_tensor_value_info("weight_scale", onnx.TensorProto.FLOAT, [1]) + weight_scale = generate_input_initializer([1], np.float32, "weight_scale") + Attention_mask = onnx.helper.make_tensor_value_info("mask", onnx.TensorProto.INT32, [13, 7]) + mask = generate_input_initializer([13, 7], np.int32, "mask") + Input_zo = onnx.helper.make_tensor_value_info("input_zero_point", onnx.TensorProto.INT8, [1]) + input_zero_point = generate_input_initializer([1], np.int8, "input_zero_point") + Weight_zo = onnx.helper.make_tensor_value_info("weight_zero_point", onnx.TensorProto.INT8, [1]) + weight_zero_point = generate_input_initializer([1], np.int8, "weight_zero_point") + Q_scale = onnx.helper.make_tensor_value_info("attn_output_scale", onnx.TensorProto.FLOAT, [1]) + attn_output_scale = generate_input_initializer([1], np.float32, "attn_output_scale") + Q_zo = onnx.helper.make_tensor_value_info("attn_output_zero_point", onnx.TensorProto.INT8, [1]) + attn_output_zero_point = generate_input_initializer([1], np.int8, "attn_output_zero_point") + Output = onnx.helper.make_tensor_value_info("attn_output_quantized", onnx.TensorProto.INT8, [13, 7]) + attention_node = onnx.helper.make_node( + "QAttention", + [ + "input_quantized", + "weight_quantized", + "bias", + "input_scale", + "weight_scale", + "mask", + "input_zero_point", + "weight_zero_point", + ], + ["attn_output"], + name="attention_quant", + ) + qlinear_node = onnx.helper.make_node( + "QuantizeLinear", + ["attn_output", "attn_output_scale", "attn_output_zero_point"], + ["attn_output_quantized"], + name="attn_output_QuantizeLinear", + ) + graph = onnx.helper.make_graph( + [attention_node, qlinear_node], + "test_graph_5", + [ + Attention_input, + Attention_weight, + Attention_bias, + Input_scale, + Weight_scale, + Attention_mask, + Input_zo, + Weight_zo, + Q_scale, + Q_zo, + 
], + [Output], + ) + graph.initializer.add().CopyFrom(weight_quantized) + graph.initializer.add().CopyFrom(bias) + graph.initializer.add().CopyFrom(input_scale) + graph.initializer.add().CopyFrom(weight_scale) + graph.initializer.add().CopyFrom(mask) + graph.initializer.add().CopyFrom(input_zero_point) + graph.initializer.add().CopyFrom(weight_zero_point) + graph.initializer.add().CopyFrom(attn_output_scale) + graph.initializer.add().CopyFrom(attn_output_zero_point) + model = onnx.helper.make_model(graph) + + # Augmenting graph + data_reader = None + augment = calibrate.ONNXRTAugment(model, data_reader, [], white_nodes=["attention"]) + augment.augment_nodes = ["DequantizeLinear"] + augment.already_quantized = True + + augment.augment_graph() + augmented_model = augment.augmented_model + + augmented_model_node_names = [node.name for node in augmented_model.graph.node] + augmented_model_outputs = [output.name for output in augmented_model.graph.output] + added_node_names = ["attention_quant", "attn_output_QuantizeLinear", "input_quantized_DequantizeLinear"] + added_outputs = ["attn_output_quantized", "input_quantized_output", "attn_output"] + self.assertEqual(len(augmented_model_node_names), 3) + self.assertEqual(len(augmented_model_outputs), 3) + for name in added_node_names: + self.assertTrue(name in augmented_model_node_names) + for output in added_outputs: + self.assertTrue(output in augmented_model_outputs) + + print("Finished TEST_CONFIG_5") + + # QuantizeLinear + # | + # QLinearConv + # | + # DequantizeLinear + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + A_scale = onnx.helper.make_tensor_value_info("A_scale", onnx.TensorProto.FLOAT, [1]) + a_scale = generate_input_initializer([1], np.float32, "A_scale") + A_zo = onnx.helper.make_tensor_value_info("A_zero_point", onnx.TensorProto.INT8, [1]) + a_zero_point = generate_input_initializer([1], np.int8, "A_zero_point") + C = onnx.helper.make_tensor_value_info("C", onnx.TensorProto.INT8, [1, 1, 5, 5]) + c = generate_input_initializer([1, 1, 5, 5], np.int8, "C") + C_scale = onnx.helper.make_tensor_value_info("C_scale", onnx.TensorProto.FLOAT, [1]) + c_scale = generate_input_initializer([1], np.float32, "C_scale") + C_zo = onnx.helper.make_tensor_value_info("C_zero_point", onnx.TensorProto.INT8, [1]) + c_zero_point = generate_input_initializer([1], np.int8, "C_zero_point") + E = onnx.helper.make_tensor_value_info("E", onnx.TensorProto.INT32, [1]) + e = generate_input_initializer([1], np.int32, "E") + D_scale = onnx.helper.make_tensor_value_info("D_scale", onnx.TensorProto.FLOAT, [1]) + d_scale = generate_input_initializer([1], np.float32, "D_scale") + D_zo = onnx.helper.make_tensor_value_info("D_zero_point", onnx.TensorProto.INT8, [1]) + d_zero_point = generate_input_initializer([1], np.int8, "D_zero_point") + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + quantize_node = onnx.helper.make_node( + "QuantizeLinear", ["A", "A_scale", "A_zero_point"], ["A_quantized"], name="A_QuantizeLinear" + ) + conv_node = onnx.helper.make_node( + "QLinearConv", + [ + "A_quantized", + "A_scale", + "A_zero_point", + "C_quantized", + "C_scale", + "C_zero_point", + "D_scale", + "D_zero_point", + "E", + ], + ["D_quantized"], + name="conv_quant", + kernel_shape=[3, 3], + pads=[1, 1, 1, 1], + ) + dequantize_node = onnx.helper.make_node( + "DequantizeLinear", ["D_quantized", "D_scale", "D_zero_point"], ["D"], name="D_DequantizeLinear" + ) + graph = onnx.helper.make_graph( + [quantize_node, 
conv_node, dequantize_node], + "test_graph_5", + [A, A_scale, A_zo, C, C_scale, C_zo, E, D_scale, D_zo], + [D], + ) + graph.initializer.add().CopyFrom(a_scale) + graph.initializer.add().CopyFrom(a_zero_point) + graph.initializer.add().CopyFrom(c) + graph.initializer.add().CopyFrom(c_scale) + graph.initializer.add().CopyFrom(c_zero_point) + graph.initializer.add().CopyFrom(e) + graph.initializer.add().CopyFrom(d_scale) + graph.initializer.add().CopyFrom(d_zero_point) + model = onnx.helper.make_model(graph) + + # Augmenting graph + data_reader = None + augment = calibrate.ONNXRTAugment(model, data_reader, [], white_nodes=["conv"]) + augment.augment_nodes = ["DequantizeLinear"] + augment.already_quantized = True + augment.augment_graph() + augmented_model = augment.augmented_model + + augmented_model_node_names = [node.name for node in augmented_model.graph.node] + augmented_model_outputs = [output.name for output in augmented_model.graph.output] + added_node_names = [ + "A_QuantizeLinear", + "conv_quant", + "D_DequantizeLinear", + "D_quantized_DequantizeLinear", + "A_quantized_DequantizeLinear", + ] + added_outputs = ["D", "D_quantized_output", "A_quantized_output"] + self.assertEqual(len(augmented_model_node_names), 5) + self.assertEqual(len(augmented_model_outputs), 3) + for name in added_node_names: + self.assertTrue(name in augmented_model_node_names) + for output in added_outputs: + self.assertTrue(output in augmented_model_outputs) + + def test_quant_param_calculation(self): + """TEST_CONFIG_6.""" + + # Relu + # | \ + # Conv \ + # | \ + # Relu | + # | Conv + # Conv / + # \ / + # | + # Add + + input0 = onnx.helper.make_tensor_value_info("input0", onnx.TensorProto.FLOAT, [1, 3, 1, 3]) + output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, [1, 3, 1, 3]) + + X1_weight = generate_input_initializer([3, 3, 1, 1], np.float32, "X1_weight") + X1_bias = generate_input_initializer([3], np.float32, "X1_bias") + X3_weight = generate_input_initializer([3, 3, 1, 1], np.float32, "X3_weight") + X3_bias = generate_input_initializer([3], np.float32, "X3_bias") + X5_weight = generate_input_initializer([3, 3, 1, 1], np.float32, "X5_weight") + X5_bias = generate_input_initializer([3], np.float32, "X5_bias") + + relu_node_1 = onnx.helper.make_node("Relu", ["input0"], ["X1"], name="Relu1") + conv_node_1 = onnx.helper.make_node("Conv", ["X1", "X1_weight", "X1_bias"], ["X2"], name="Conv1") + relu_node_2 = onnx.helper.make_node("Relu", ["X2"], ["X3"], name="Relu2") + conv_node_2 = onnx.helper.make_node("Conv", ["X3", "X3_weight", "X3_bias"], ["X4"], name="Conv2") + conv_node_3 = onnx.helper.make_node("Conv", ["X1", "X5_weight", "X5_bias"], ["X5"], name="Conv3") + add_node = onnx.helper.make_node("Add", ["X4", "X5"], ["output"], name="Add") + + graph = onnx.helper.make_graph( + [relu_node_1, conv_node_1, relu_node_2, conv_node_2, conv_node_3, add_node], + "test_graph_5", + [input0], + [output], + ) + graph.initializer.add().CopyFrom(X1_weight) + graph.initializer.add().CopyFrom(X1_bias) + graph.initializer.add().CopyFrom(X3_weight) + graph.initializer.add().CopyFrom(X3_bias) + graph.initializer.add().CopyFrom(X5_weight) + graph.initializer.add().CopyFrom(X5_bias) + model = onnx.helper.make_model(graph, **{"opset_imports": [onnx.helper.make_opsetid("", 13)]}) + data_reader = DataReader() + augment = calibrate.ONNXRTAugment(model, data_reader, ["Conv", "MatMul"]) + + # test calculation of quantization params + data_reader.rewind() + quantization_params_dict = augment.dump_calibration({}) + 
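# re-collect the intermediate outputs so the expected scale/zero-point can be recomputed from the raw min/max thresholds below +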
data_reader.rewind() + node_output_names, output_dicts_list = augment.get_intermediate_outputs({}) + data_reader.rewind() + dict_for_quantization = augment._map_calibration(node_output_names, output_dicts_list) + # check the size of the quantization dictionary + + self.assertEqual(len(quantization_params_dict), 12) + + # check the computation of zp and scale + for key, value in quantization_params_dict.items(): + self.assertTrue(value is not None) + self.assertTrue(len(value) == 2) + + thresholds = dict_for_quantization[key] + rmin = min(thresholds[0], 0) + rmax = max(thresholds[1], 0) + if key == "X2": # next_node is Relu + if rmin < 0: + rmin = 0 + + scale_expected = np.float32((rmax - rmin) / 255 if rmin != rmax else 1) + zp_expected = np.uint8(round(max(0, min(255, (0 - rmin) / scale_expected)))) + zp_actual = value[0] + scale_actual = value[1] + + self.assertAlmostEqual(zp_expected, zp_actual) + self.assertAlmostEqual(scale_expected, scale_actual) + + print("Finished" + " test calculation of quantization params.") + + def test_calibrator(self): + regular_data = [np.arange(15).reshape(3, 5).astype("float32"), np.arange(15).reshape(3, 5).astype("float32")] + irregular_data = [np.arange(10).reshape(2, 5).astype("float32"), np.arange(5).reshape(1, 5).astype("float32")] + + calib = calibrator.CALIBRATOR["MinMax"]() + calib.collect(irregular_data) + res = calib.calib_range + self.assertEqual(res[0], np.array(0.0).astype(np.float32)) + self.assertEqual(res[1], np.array(9.0).astype(np.float32)) + calib.collect(regular_data) + res = calib.calib_range + self.assertEqual(res[0], np.array(0.0).astype(np.float32)) + self.assertEqual(res[1], np.array(14.0).astype(np.float32)) + calib.clear() + res = calib.calib_range + self.assertIsNone(res[0]) + self.assertIsNone(res[1]) + del calib + + calib = calibrator.CALIBRATOR["Entropy"]() + calib.collect(irregular_data) + res = calib.calib_range + self.assertEqual(res[0], np.array(0.0).astype(np.float32)) + self.assertEqual(res[1], np.array(9.0).astype(np.float32)) + calib.collect(regular_data) + res = calib.calib_range + self.assertEqual(res[0], np.array(0.0).astype(np.float32)) + self.assertEqual(res[1], np.array(9.140625).astype(np.float32)) + calib.clear() + res = calib.calib_range + self.assertIsNone(res[0]) + self.assertIsNone(res[1]) + del calib + + calib = calibrator.CALIBRATOR["Percentile"]() + calib.collect(irregular_data) + res = calib.calib_range + self.assertEqual(res[0], np.array(0.0).astype(np.float32)) + self.assertEqual(res[1], np.array(8.991211).astype(np.float32)) + calib.collect(regular_data) + res = calib.calib_range + self.assertEqual(res[0], np.array(0.0).astype(np.float32)) + self.assertEqual(res[1], np.array(13.9921875).astype(np.float32)) + calib.clear() + res = calib.calib_range + self.assertIsNone(res[0]) + self.assertIsNone(res[1]) + del calib + + +if __name__ == "__main__": + unittest.main() diff --git a/test/quantization/post_training_quant/test_operators.py b/test/quantization/post_training_quant/test_operators.py new file mode 100644 index 000000000..45c189328 --- /dev/null +++ b/test/quantization/post_training_quant/test_operators.py @@ -0,0 +1,1957 @@ +import collections +import copy +import os +import shutil +import unittest + +import numpy as np +import onnx +import onnxruntime as ort + +from onnx_neural_compressor import quantization +from onnx_neural_compressor.algorithms.post_training_quant import quantizer + + +def build_model(): + initializers = [] + input = onnx.helper.make_tensor_value_info("input", 
onnx.TensorProto.FLOAT, [1, 3, 15, 15]) + output = onnx.helper.make_tensor_value_info("add_out_2", onnx.TensorProto.FLOAT, [88, 11]) + + add_node = onnx.helper.make_node("Add", ["input", "add_init"], ["add_out"], name="add") + + conv1_weight_initializer = onnx.numpy_helper.from_array( + np.random.randint(-1, 2, [3, 3, 3, 3]).astype(np.float32), name="conv1_weight" + ) + conv1_node = onnx.helper.make_node("Conv", ["add_out", "conv1_weight"], ["conv1_output"], name="conv1") + + conv2_weight_initializer = onnx.numpy_helper.from_array( + np.random.randint(-1, 2, [5, 3, 3, 3]).astype(np.float32), name="conv2_weight" + ) + conv2_node = onnx.helper.make_node("Conv", ["add_out", "conv2_weight"], ["conv2_output"], name="conv2") + + # 1, 8, 13, 13 + concat_node = onnx.helper.make_node( + "Concat", ["conv1_output", "conv2_output"], ["concat_output"], name="Concat", axis=1 + ) + # 1, 8, 11, 11 + avg_args = {"kernel_shape": [3, 3]} + avgpool_node = onnx.helper.make_node( + "AveragePool", ["concat_output"], ["avg_output"], name="AveragePool", **avg_args + ) + reshape_node = onnx.helper.make_node("Reshape", ["avg_output", "shape"], ["reshape_output"], name="Reshape") + + add_node_2 = onnx.helper.make_node("Add", ["reshape_output", "add_init_2"], ["add_out_2"], name="add_2") + + initializers = [conv1_weight_initializer, conv2_weight_initializer] + initializers.append(onnx.numpy_helper.from_array(np.array([88, 11], dtype=np.int64), name="shape")) + initializers.append(onnx.numpy_helper.from_array(np.zeros((1, 3, 15, 15)).astype("float32"), name="add_init")) + initializers.append(onnx.numpy_helper.from_array(np.zeros((88, 11)).astype("float32"), name="add_init_2")) + + graph = onnx.helper.make_graph( + [conv1_node, conv2_node, concat_node, avgpool_node, reshape_node, add_node, add_node_2], + "test", + [input], + [output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + return model + + +class TestQuantizer(unittest.TestCase): + qlinear_backend = "qoperator" + qdq_backend = "qdq" + + q_config = { + "weight_type": 3, + "activation_type": 2, + "per_channel": False, + "weight_sym": True, + "activation_sym": False, + "calibrate_method": "MinMax", + } + + @classmethod + def setUpClass(cls): + os.makedirs("./onnxrt_test") + + @classmethod + def tearDownClass(cls): + shutil.rmtree("./onnxrt_test", ignore_errors=True) + + def qlinear_test(self, model, q_config, quantize_params, quantizable_op_types, **kwargs): + quant = quantizer.StaticQuantizer( + model=copy.deepcopy(model), + q_config=q_config, + quant_format=self.qlinear_backend, + quantization_params=quantize_params, + op_types_to_quantize=quantizable_op_types, + **kwargs, + ) + quant.quantize_model() + assert quant.model.model + return quant.model + + def qdq_test(self, model, q_config, quantize_params, quantizable_op_types, **kwargs): + quant = quantizer.StaticQuantizer( + model=copy.deepcopy(model), + q_config=q_config, + quant_format=self.qdq_backend, + quantization_params=quantize_params, + op_types_to_quantize=quantizable_op_types, + **kwargs, + ) + quant.quantize_model() + assert quant.model.model + return quant.model + + def dynamic_test(self, model, q_config, quantize_params, quantizable_op_types, **kwargs): + quant = quantizer.DynamicQuantizer( + model=copy.deepcopy(model), + q_config=q_config, + quantization_params=quantize_params, + op_types_to_quantize=quantizable_op_types, + **kwargs, + ) + quant.quantize_model() + assert quant.model.model + return quant.model + + def 
test_resize(self): + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [1, 2, 26, 42]) + + conv_weight_arr = np.random.randint(-1, 2, [3, 2, 3, 3]).astype(np.float32) + conv_weight_initializer = onnx.numpy_helper.from_array(conv_weight_arr, name="conv1_weight") + conv_node = onnx.helper.make_node("Conv", ["input", "conv1_weight"], ["conv_output"], name="conv_node") + + initializers = [conv_weight_initializer] + + output_tensor = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, [1, 3, 48, 80]) + resize_inputs = ["conv_output"] # resize_roi_name, resize_scales_name, resize_sizes_name] + resize_attrs = {"coordinate_transformation_mode": "asymmetric", "mode": "nearest", "nearest_mode": "floor"} + resize_node = onnx.helper.make_node("Resize", resize_inputs, ["output"], name="resize_node", **resize_attrs) + resize_roi = [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0] + resize_roi_name = "resize_roi" + resize_roi_initializer = onnx.helper.make_tensor( + resize_roi_name, onnx.TensorProto.FLOAT, [len(resize_roi)], resize_roi + ) + initializers.extend([resize_roi_initializer]) + resize_node.input.extend([resize_roi_name]) + + resize_scales = [1.0, 1.0, 2.0, 2.0] + resize_scales_name = "resize_scales" + resize_scales_initializer = onnx.helper.make_tensor( + resize_scales_name, onnx.TensorProto.FLOAT, [len(resize_scales)], resize_scales + ) + initializers.extend([resize_scales_initializer]) + resize_node.input.extend([resize_scales_name]) + + graph = onnx.helper.make_graph( + [conv_node, resize_node], + "TestOpQuantizerResize_test_model", + [input_tensor], + [output_tensor], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 # use stable onnx ir version + + q_config = {"conv_node": self.q_config, "resize_node": self.q_config} + quantize_params = { + "input": [np.uint8(0), np.float32(10.0)], + "conv1_weight": [np.uint8(0), np.float32(10.0)], + "conv_output": [np.uint8(0), np.float32(10.0)], + "output": [np.uint8(0), np.float32(10.0)], + } + + q_model = self.qlinear_test(model, q_config, quantize_params, ["Resize", "Conv"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + + q_model = self.qdq_test(model, q_config, quantize_params, ["Resize", "Conv"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 3) + + # test opset version 10 + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 10)]) + model.ir_version = 7 # use stable onnx ir version + + q_model = self.qlinear_test(model, q_config, quantize_params, ["Resize", "Conv"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + + q_model = self.qdq_test(model, q_config, quantize_params, ["Resize", "Conv"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + + 
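# ArgMax runs on the quantized Conv output directly, so the QOperator model should keep exactly one QuantizeLinear/DequantizeLinear pair +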
def test_argmax(self): + input_name = "input" + output_name = "output" + input_shape = [1, 256, 128, 128] + output_shape = [1, 32, 128] + initializers = [] + + # make Conv node + conv_weight_name = "conv_weight" + conv_weight_arr = np.random.randint(-1, 2, [32, 256, 1, 1]).astype(np.float32) + conv_weight_initializer = onnx.numpy_helper.from_array(conv_weight_arr, name=conv_weight_name) + conv_output_name = "conv_output" + conv_inputs = [input_name, conv_weight_name] + conv_outputs = [conv_output_name] + conv_name = "conv_node" + conv_node = onnx.helper.make_node( + "Conv", + conv_inputs, + conv_outputs, + dilations=[1, 1], + kernel_shape=[1, 1], + pads=[0, 0, 0, 0], + strides=[1, 1], + name=conv_name, + ) + + # make ArgMax node + argmax_inputs = [conv_output_name] + argmax_outputs = [output_name] + argmax_name = "argmax_node" + argmax_node = onnx.helper.make_node( + "ArgMax", + argmax_inputs, + argmax_outputs, + axis=3, + keepdims=0, + name=argmax_name, + ) + + initializers = [conv_weight_initializer] + + # make graph + input_tensor = onnx.helper.make_tensor_value_info(input_name, onnx.TensorProto.FLOAT, input_shape) + output_tensor = onnx.helper.make_tensor_value_info(output_name, onnx.TensorProto.INT64, output_shape) + graph_name = "ArgMax_Quant_Test" + graph = onnx.helper.make_graph( + [conv_node, argmax_node], + graph_name, + [input_tensor], + [output_tensor], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 # use stable onnx ir version + q_config = {"conv_node": self.q_config, "argmax_node": self.q_config} + quantize_params = { + "input": [np.uint8(0), np.float32(10.0)], + "conv_weight": [np.uint8(0), np.float32(10.0)], + "conv_output": [np.uint8(0), np.float32(10.0)], + "output": [np.uint8(0), np.float32(10.0)], + } + q_model = self.qlinear_test(model, q_config, quantize_params, ["Conv", "ArgMax"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + + def test_gemm(self): + input_name = "input" + output_name = "output" + initializers = [] + weight_shape = [100, 10] + weight_name = "linear1.weight" + bias_shape = [100] + bias_name = "linear1.bias" + node_name = "gemm" + + weight_data = np.random.normal(0, 0.1, weight_shape).astype(np.float32) + initializers.append(onnx.numpy_helper.from_array(weight_data, name=weight_name)) + + bias_data = np.random.normal(0, 0.1, bias_shape).astype(np.float32) + initializers.append(onnx.numpy_helper.from_array(bias_data, name=bias_name)) + + gemm1_node = onnx.helper.make_node( + "Gemm", [input_name, weight_name, bias_name], [output_name], alpha=1.0, beta=1.0, transB=1, name=node_name + ) + + gemm1_output_name = "gemm1_output" + input_tensor = onnx.helper.make_tensor_value_info(input_name, onnx.TensorProto.FLOAT, [-1, 10]) + output_tensor = onnx.helper.make_tensor_value_info(output_name, onnx.TensorProto.FLOAT, [-1, 100]) + graph_name = "gemm_test" + graph = onnx.helper.make_graph( + [gemm1_node], + graph_name, + [input_tensor], + [output_tensor], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 # use stable onnx ir version + q_config = {"gemm": self.q_config} + quantize_params = { + "input": [np.uint8(0), np.float32(10.0)], + "linear1.weight": [np.uint8(0), np.float32(10.0)], + 
"linear1.bias": [np.uint8(0), np.float32(10.0)], + "output": [np.uint8(0), np.float32(10.0)], + } + q_model = self.qlinear_test(model, q_config, quantize_params, ["Gemm"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + q_model = self.qdq_test(model, q_config, quantize_params, ["Gemm"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + + # test gemm with non-constant bias + bias_tensor = onnx.helper.make_tensor_value_info(bias_name, onnx.TensorProto.FLOAT, [100]) + gemm2_node = onnx.helper.make_node( + "Gemm", [input_name, weight_name, bias_name], [output_name], alpha=1.0, beta=1.0, transB=1, name=node_name + ) + initializers = [] + initializers.append(onnx.numpy_helper.from_array(weight_data, name=weight_name)) + graph_name = "gemm_test" + graph = onnx.helper.make_graph( + [gemm2_node], + graph_name, + [input_tensor, bias_tensor], + [output_tensor], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 + q_model = self.qlinear_test(model, q_config, quantize_params, ["Gemm"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 0) + q_model = self.qdq_test(model, q_config, quantize_params, ["Gemm"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + + def test_embed(self): + input_ids_shape = [1, 4] + input_ids_tensor = onnx.helper.make_tensor_value_info("input_ids", onnx.TensorProto.INT32, input_ids_shape) + + segment_ids_shape = [1, 4] + segment_ids_tensor = onnx.helper.make_tensor_value_info( + "segment_ids", onnx.TensorProto.INT32, segment_ids_shape + ) + + # EmbedLayerNormalization Node Constants and Weights: + word_embed_shape = [32, 4] + word_embed_weights = np.random.random_sample(word_embed_shape).astype(dtype="float32") + word_embed_initializer = onnx.numpy_helper.from_array(word_embed_weights, name="word_embed") + + pos_embed_shape = [16, 4] + pos_embed_weights = np.random.random_sample(pos_embed_shape).astype(dtype="float32") + pos_embed_initializer = onnx.numpy_helper.from_array(pos_embed_weights, name="pos_embed") + + seg_embed_shape = [2, 4] + seg_embed_weights = np.random.random_sample(seg_embed_shape).astype(dtype="float32") + seg_embed_initializer = onnx.numpy_helper.from_array(seg_embed_weights, name="seg_embed") + + gamma_shape = [4] + gamma = np.random.random_sample(gamma_shape).astype(dtype="float32") + gamma_initializer = onnx.numpy_helper.from_array(gamma, name="gamma") + + beta_shape = [4] + beta = np.random.random_sample(beta_shape).astype(dtype="float32") + beta_initializer = onnx.numpy_helper.from_array(beta, name="beta") + + # EmbedLayerNormalization Outputs: + layernorm_out_shape = [1, 4, 4] + layernorm_out_tensor = onnx.helper.make_tensor_value_info( + "layernorm_out", onnx.TensorProto.FLOAT, layernorm_out_shape + ) + + 
mask_index_out_shape = [1] + mask_index_out_tensor = onnx.helper.make_tensor_value_info( + "mask_index_out", onnx.TensorProto.INT32, mask_index_out_shape + ) + + # EmbedLayerNormalization Node: + embed_layer_norm_inputs = ["input_ids", "segment_ids", "word_embed", "pos_embed", "seg_embed", "gamma", "beta"] + embed_layer_norm_outputs = ["layernorm_out", "mask_index_out"] + embed_layer_norm_node = onnx.helper.make_node( + "EmbedLayerNormalization", + embed_layer_norm_inputs, + embed_layer_norm_outputs, + domain="com.microsoft", + name="Embed", + ) + + # Construct the Graph and Model: + nodes = [embed_layer_norm_node] + graph_name = "embed_layernorm_graph" + inputs = [input_ids_tensor, segment_ids_tensor] + outputs = [layernorm_out_tensor, mask_index_out_tensor] + initializers = [ + word_embed_initializer, + pos_embed_initializer, + seg_embed_initializer, + gamma_initializer, + beta_initializer, + ] + + graph = onnx.helper.make_graph(nodes, graph_name, inputs, outputs, initializer=initializers) + model = onnx.helper.make_model( + graph, + opset_imports=[onnx.helper.make_opsetid("com.microsoft", 14), onnx.helper.make_opsetid("ai.onnx", 14)], + ) + model.ir_version = 7 # use stable onnx ir version + + q_config = {"Embed": self.q_config} + quantize_params = { + "word_embed": [np.uint8(10.0), np.float32(0)], + "pos_embed": [np.uint8(10.0), np.float32(0)], + "seg_embed": [np.uint8(10.0), np.float32(0)], + "gamma": [np.uint8(10.0), np.float32(0)], + "beta": [np.uint8(10.0), np.float32(0)], + "layernorm_out": [np.uint8(10.0), np.float32(0)], + "mask_index_out": [np.uint8(10.0), np.float32(0)], + "input_ids": [np.uint8(10.0), np.float32(0)], + } + q_model = self.qlinear_test(model, q_config, quantize_params, ["EmbedLayerNormalization"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["QEmbedLayerNormalization"], 1 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, ["EmbedLayerNormalization"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 5 + ) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["EmbedLayerNormalization"], 1 + ) + + def test_LSTM(self): + input_shape = [1, 1, 200] + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, input_shape) + + w_shape = [2, 400, 200] + w_weights = np.random.random_sample(w_shape).astype(dtype="float32") + w_init = onnx.numpy_helper.from_array(w_weights, name="w") + + r_shape = [2, 400, 100] + r_weights = np.random.random_sample(r_shape).astype(dtype="float32") + r_init = onnx.numpy_helper.from_array(r_weights, name="r") + + b_shape = [2, 800] + b_weights = np.random.random_sample(b_shape).astype(dtype="float32") + b_init = onnx.numpy_helper.from_array(b_weights, name="b") + + out_shape = [1, 2, 1, 100] + out_tensor = onnx.helper.make_tensor_value_info("out", onnx.TensorProto.FLOAT, out_shape) + + kwargs = {} + kwargs["direction"] = "bidirectional" + kwargs["activations"] = ["Sigmoid", "Tanh", "Tanh", "Sigmoid", "Tanh", "Tanh"] + kwargs["hidden_size"] = 100 + kwargs["input_forget"] = 0 + + lstm_node = onnx.helper.make_node("LSTM", ["input", "w", "r", "b"], ["out"], name="lstm", domain="", **kwargs) + graph = onnx.helper.make_graph( + [lstm_node], "test", [input_tensor], [out_tensor], initializer=[w_init, r_init, b_init] + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 11)]) + model.ir_version = 7 # use stable onnx ir 
version + + q_config = {"lstm": self.q_config} + q_model = self.dynamic_test(model, q_config, None, ["LSTM"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLSTM"], 1 + ) + + def test_concat_reshape_pooling(self): + model = build_model() + + q_config = { + "Reshape": self.q_config, + "conv1": self.q_config, + "conv2": self.q_config, + "Concat": self.q_config, + "AveragePool": self.q_config, + "add": self.q_config, + } + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "conv1_weight": [np.uint8(10.0), np.float32(0)], + "conv1_output": [np.uint8(10.0), np.float32(0)], + "conv2_weight": [np.uint8(10.0), np.float32(0)], + "conv2_output": [np.uint8(10.0), np.float32(0)], + "concat_output": [np.uint8(10.0), np.float32(0)], + "avg_output": [np.uint8(10.0), np.float32(0)], + "add_out": [np.uint8(10.0), np.float32(0)], + "add_init": [np.uint8(10.0), np.float32(0)], + "shape": [np.uint8(10.0), np.float32(0)], + "reshape_output": [np.uint8(10.0), np.float32(0)], + "add_init_2": [np.uint8(10.0), np.float32(0)], + "add_out_2": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["Reshape", "Conv", "Concat", "AveragePool", "Add"] + q_model = self.qlinear_test( + model, q_config, quantize_params, quantizable_op_types, **{"dedicated_qdq_pair": True} + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types, **{"dedicated_qdq_pair": True}) + q_model.save("test.onnx") + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 7) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 9 + ) + + q_config = { + "Reshape": self.q_config, + "conv1": "fp32", + "conv2": self.q_config, + "Concat": self.q_config, + "AveragePool": self.q_config, + } + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3 + ) + + q_config = { + "Reshape": self.q_config, + "conv1": "fp32", + "conv2": "fp32", + "Concat": self.q_config, + "AveragePool": self.q_config, + } + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + 
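# keep AveragePool in fp32 and verify it remains a plain AveragePool node after quantization +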
q_config = { + "Reshape": self.q_config, + "conv1": self.q_config, + "conv2": self.q_config, + "Concat": self.q_config, + "AveragePool": "fp32", + } + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["AveragePool"], 1) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) + + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "conv1_weight": [np.uint8(10.0), np.float32(0)], + "conv1_output": [np.uint8(10.0), np.float32(0)], + "conv2_weight": [np.uint8(10.0), np.float32(0)], + "conv2_output": [np.uint8(10.0), np.float32(0)], + "concat_output": [np.uint8(10.0), np.float32(0)], + "avg_output": [np.uint8(10.0), np.float32(0)], + "shape": [np.uint8(10.0), np.float32(0)], + "add_out": [np.uint8(10.0), np.float32(0)], + "add_init": [np.uint8(10.0), np.float32(0)], + "reshape_output": [np.uint8(10.0), np.float32(0)], + } + q_config = { + "Reshape": self.q_config, + "conv1": self.q_config, + "conv2": self.q_config, + "Concat": self.q_config, + "AveragePool": self.q_config, + } + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["Add"], 2) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 6) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 8 + ) + + def test_conv(self): + for op in ["Conv", "FusedConv"]: + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 5, 5, 1]) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 3, 3, 1]) + C = onnx.helper.make_tensor( + "C", onnx.TensorProto.FLOAT, [1, 5, 5, 1], np.random.random((1, 5, 5, 1)).reshape(25).tolist() + ) + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 1, 5, 1]) + conv_node = onnx.helper.make_node( + op, ["A", "B", "C"], ["D"], name=op, kernel_shape=[3, 3], pads=[1, 1, 1, 1] + ) + initializers = [C] + graph = onnx.helper.make_graph([conv_node], "test_graph_1", [A, B], [D], initializer=initializers) + model = onnx.helper.make_model(graph) + q_config = {op: self.q_config} + quantize_params = { + "A": [np.uint8(10.0), np.float32(0)], + "B": [np.uint8(10.0), np.float32(0)], + "C": [np.uint8(10.0), np.float32(0)], + "D": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["Conv"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2 + ) + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + 
collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 3 + ) + + def test_matmul(self): + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + B_init = onnx.helper.make_tensor( + "B", onnx.TensorProto.FLOAT, [1, 1, 5, 1], np.random.random((1, 1, 5, 1)).reshape(5).tolist() + ) + C = onnx.helper.make_tensor_value_info("C", onnx.TensorProto.FLOAT, [1, 1, 5, 1]) + matmul_node = onnx.helper.make_node("MatMul", ["A", "B"], ["C"], name="Matmul") + graph = onnx.helper.make_graph([matmul_node], "test_graph_1", [A], [C], [B_init]) + model = onnx.helper.make_model(graph) + q_config = {"Matmul": self.q_config} + quantize_params = { + "A": [np.uint8(10.0), np.float32(0)], + "B": [np.uint8(10.0), np.float32(0)], + "C": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + + q_config = {"Matmul": self.q_config} + q_model = self.dynamic_test(model, q_config, None, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["MatMulInteger"], 1) + + quantize_params = {"A": [np.float32(10.0)], "B": [np.float32(10.0)], "C": [np.float32(10.0)]} + with self.assertRaises(ValueError): + self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + with self.assertRaises(ValueError): + self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + + quantize_params = {} + q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["MatMulInteger"], 1) + + def test_attention(self): + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + C = onnx.helper.make_tensor_value_info("C", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + node = onnx.helper.make_node("Attention", ["A", "B", "C"], ["D"], name="Attention") + graph = onnx.helper.make_graph([node], "test_graph_1", [A, B, C], [D]) + model = onnx.helper.make_model(graph) + q_config = {"Attention": self.q_config} + quantize_params = { + "A": [np.uint8(0), np.float32(0.5)], + "B": [np.uint8(0), np.float32(0.5)], + "C": [np.uint8(0), np.float32(0.5)], + "D": [np.uint8(0), np.float32(0.5)], + } + quantizable_op_types = ["Attention"] + + q_model = self.qlinear_test(model, q_config, quantize_params, 
quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QAttention"], 1) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + q_config = {"Attention": self.q_config} + q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLinear"], 2 + ) + + E = onnx.helper.make_tensor_value_info("E", onnx.TensorProto.INT32, [1, 1, 5, 5]) + F = onnx.helper.make_tensor_value_info("F", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + node = onnx.helper.make_node("Attention", ["A", "B", "C", "F", "E"], ["D"], name="Attention") + graph = onnx.helper.make_graph([node], "test_graph_1", [A, B, C, F, E], [D]) + model = onnx.helper.make_model(graph) + q_config = {"Attention": self.q_config} + quantize_params = { + "A": [np.uint8(0), np.float32(0.5)], + "B": [np.uint8(0), np.float32(0.5)], + "C": [np.uint8(0), np.float32(0.5)], + "D": [np.uint8(0), np.float32(0.5)], + } + quantizable_op_types = ["Attention"] + + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2 + ) + + q_config = {"Attention": self.q_config} + q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLinear"], 2 + ) + + def test_gather(self): + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [3, 2]) + + matmul_weight = onnx.helper.make_tensor( + "matmul_weight", onnx.TensorProto.FLOAT, [2, 3], np.random.random((2, 3)).reshape(6).tolist() + ) + matmul_output = onnx.helper.make_tensor_value_info("matmul_output", onnx.TensorProto.FLOAT, [3, 3]) + matmul_node = onnx.helper.make_node("MatMul", ["input", "matmul_weight"], ["matmul_output"], name="MatMul") + + gather_indices = onnx.helper.make_tensor("gather_indices", onnx.TensorProto.INT64, [1, 2], [0, 2]) + gather_output = onnx.helper.make_tensor_value_info("gather_output", onnx.TensorProto.FLOAT, [1, 2, 3]) + gather_node = onnx.helper.make_node( + "Gather", ["matmul_output", "gather_indices"], ["gather_output"], name="Gather" + ) + + initializers = [matmul_weight, gather_indices] + graph = onnx.helper.make_graph( + [matmul_node, gather_node], + "TestGather_test_model", + [input_tensor], + [gather_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 + + q_config = {"Gather": self.q_config, "MatMul": self.q_config} + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul_weight": [np.uint8(10.0), 
np.float32(0)], + "matmul_output": [np.uint8(10.0), np.float32(0)], + "gather_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["Gather", "MatMul"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 3) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) + + q_config = {"Gather": self.q_config, "MatMul": self.q_config} + q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(len(q_model.model.graph.node), 6) + + def test_split(self): + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [100, 2]) + e_value = np.random.randn(2, 2).astype(np.float32) + E_init = onnx.helper.make_tensor("E", onnx.TensorProto.FLOAT, [2, 2], e_value.reshape(4).tolist()) + + matmul_node = onnx.helper.make_node("MatMul", ["D", "E"], ["A"], name="Matmul") + + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [50, 2]) + C = onnx.helper.make_tensor_value_info("C", onnx.TensorProto.FLOAT, [50, 2]) + node = onnx.helper.make_node("Split", ["A"], ["B", "C"], name="Split", **{"num_outputs": 2}) + graph = onnx.helper.make_graph([matmul_node, node], "test_graph_1", [D], [B, C], [E_init]) + model = onnx.helper.make_model(graph) + q_config = { + "Split": self.q_config, + "Matmul": { + "weight_type": 3, + "activation_type": 2, + "per_channel": False, + "weight_sym": True, + "activation_sym": False, + "calibrate_method": quantization.CalibrationMethod.MinMax, + }, + } + quantize_params = { + "A": [np.uint8(0), np.float32(0.5)], + "B": [np.uint8(0), np.float32(0.5)], + "C": [np.uint8(0), np.float32(0.5)], + "D": [np.uint8(0), np.float32(0.5)], + "E": [np.uint8(0), np.float32(0.5)], + } + quantizable_op_types = ["Split", "MatMul"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 5 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + + def test_pad(self): + b_value = np.array([0, 1, 1, 0, 1, 1]).astype(np.int64) + B_init = onnx.helper.make_tensor("B", onnx.TensorProto.INT64, [6], b_value.reshape(6).tolist()) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.INT64, [6]) + C = onnx.helper.make_tensor_value_info("C", onnx.TensorProto.FLOAT, [1, 7, 7]) + + d_value = np.random.randn(1).astype(np.float32) + D_init = onnx.helper.make_tensor("D", onnx.TensorProto.FLOAT, [1], d_value.reshape(1).tolist()) + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1]) + + e_value = np.random.randn(1, 5, 5).astype(np.float32) + E_init = 
onnx.helper.make_tensor("E", onnx.TensorProto.FLOAT, [1, 1, 5, 5], e_value.reshape(25).tolist()) + E = onnx.helper.make_tensor_value_info("E", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + f_value = np.random.randn(1, 3, 3).astype(np.float32) + F_init = onnx.helper.make_tensor("F", onnx.TensorProto.FLOAT, [1, 1, 3, 3], f_value.reshape(9).tolist()) + F = onnx.helper.make_tensor_value_info("F", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + for mode in ["constant", "edge", "reflect", "constant_value"]: + conv_node = onnx.helper.make_node( + "Conv", ["E", "F"], ["A"], name="Conv", kernel=[3, 3], padding=[1, 1, 1, 1] + ) + if mode == "constant_value": + node = onnx.helper.make_node("Pad", ["A", "B", "D"], ["C"], name="Pad", mode="constant") + graph = onnx.helper.make_graph( + [conv_node, node], "test_graph_1", [E, F, B, D], [C], [E_init, F_init, B_init, D_init] + ) + else: + node = onnx.helper.make_node("Pad", ["A", "B"], ["C"], name="Pad", mode=mode) + graph = onnx.helper.make_graph( + [conv_node, node], "test_graph_1", [E, F, B], [C], [E_init, F_init, B_init] + ) + model = onnx.helper.make_model(graph) + conv_config = { + "weight_type": 3, + "activation_type": 2, + "per_channel": True, + "weight_sym": True, + "activation_sym": False, + "calibrate_method": quantization.CalibrationMethod.MinMax, + } + q_config = {"Conv": conv_config, "Pad": self.q_config} + quantize_params = { + "A": [np.uint8(10.0), np.float32(1)], + "C": [np.uint8(10.0), np.float32(1)], + "D": [np.uint8(10.0), np.float32(1)], + "E": [np.uint8(10.0), np.float32(1)], + "F": [np.uint8(10.0), np.float32(1)], + } + quantizable_op_types = ["Conv", "Pad"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + q_model = self.qdq_test( + model, q_config, quantize_params, quantizable_op_types, **{"dedicated_qdq_pair": True} + ) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) + + node = onnx.helper.make_node("Pad", ["E", "B", "D"], ["C"], name="Pad", mode="constant") + graph = onnx.helper.make_graph([node], "test_graph_1", [E, B, D], [C], [E_init, B_init, D_init]) + model = onnx.helper.make_model(graph) + quantize_params = {"C": [np.uint8(10.0), np.float32(0)], "E": [np.uint8(10.0), np.float32(0)]} + quantizable_op_types = ["Pad"] + q_config = {"Pad": self.q_config} + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2 + ) + + def test_binary(self): + for op in ["Mul", "Add"]: + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 10]) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1]) + C = onnx.helper.make_tensor_value_info("C", onnx.TensorProto.FLOAT, [1, 10]) + node = onnx.helper.make_node(op, ["A", "B"], ["C"], name=op) + graph = onnx.helper.make_graph([node], "test_graph_1", [A, B], [C]) + model = onnx.helper.make_model(graph) + q_config = {op: self.q_config} + quantize_params = { + "A": [np.uint8(10.0), np.float32(0)], + "B": [np.uint8(10.0), np.float32(0)], + "C": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = 
[op] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qlinear_test(model, q_config, {}, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qdq_test(model, q_config, {}, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + def test_relu(self): + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + E = onnx.helper.make_tensor_value_info("E", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + F = onnx.helper.make_tensor_value_info("F", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + + conv_node = onnx.helper.make_node( + "Conv", ["A", "B"], ["C"], name="Conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1] + ) + relu_node = onnx.helper.make_node("Relu", ["C"], ["D"], name="Relu") + add_node = onnx.helper.make_node("Add", ["D", "E"], ["F"], name="Add") + graph = onnx.helper.make_graph([conv_node, relu_node], "test_graph_1", [A, B], [D]) + model = onnx.helper.make_model(graph, **{"opset_imports": [onnx.helper.make_opsetid("", 13)]}) + sess_options = ort.SessionOptions() + sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED + sess_options.optimized_model_filepath = "./onnxrt_test/optimized_model.onnx" + session = ort.InferenceSession(model.SerializeToString(), sess_options, providers=ort.get_available_providers()) + tmp_model = onnx.load(sess_options.optimized_model_filepath) + + q_config = {"Conv": self.q_config, "Relu": self.q_config} + quantize_params = { + "A": [np.uint8(10.0), np.float32(0)], + "B": [np.uint8(10.0), np.float32(0)], + "C": [np.uint8(10.0), np.float32(0)], + "D": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["Conv", "Relu"] + q_model = self.qlinear_test(tmp_model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(len(q_model.model.graph.node), 4) + q_model = self.qdq_test(tmp_model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(len(q_model.model.graph.node), 7) + + sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC + session = ort.InferenceSession(model.SerializeToString(), sess_options, providers=ort.get_available_providers()) + tmp_model = onnx.load(sess_options.optimized_model_filepath) + q_model = self.qlinear_test(tmp_model, q_config, quantize_params, quantizable_op_types) + q_model.save("test.onnx") + self.assertEqual(len(q_model.model.graph.node), 5) + q_model = self.qdq_test(tmp_model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(len(q_model.model.graph.node), 8) + + graph = onnx.helper.make_graph([conv_node, relu_node, add_node], "test_graph_2", [A, B, E], [F]) + model = onnx.helper.make_model(graph, **{"opset_imports": [onnx.helper.make_opsetid("", 13)]}) + sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC + session = 
ort.InferenceSession(model.SerializeToString(), sess_options, providers=ort.get_available_providers()) + tmp_model = onnx.load(sess_options.optimized_model_filepath) + q_model = self.qlinear_test(tmp_model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(len(q_model.model.graph.node), 5) + q_model = self.qdq_test(tmp_model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(len(q_model.model.graph.node), 8) + + def test_clip(self): + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + conv_node = onnx.helper.make_node( + "Conv", ["A", "B"], ["C"], name="Conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1] + ) + clip_node = onnx.helper.make_node("Clip", ["C"], ["D"], name="Clip") + graph = onnx.helper.make_graph([conv_node, clip_node], "test_graph_1", [A, B], [D]) + model = onnx.helper.make_model(graph, **{"opset_imports": [onnx.helper.make_opsetid("", 13)]}) + + sess_options = ort.SessionOptions() + sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED + sess_options.optimized_model_filepath = "./onnxrt_test/optimized_model.onnx" + session = ort.InferenceSession(model.SerializeToString(), sess_options, providers=ort.get_available_providers()) + model = onnx.load(sess_options.optimized_model_filepath) + + q_config = {"Conv": self.q_config, "Clip": self.q_config} + quantize_params = { + "A": [np.uint8(10.0), np.float32(0)], + "B": [np.uint8(10.0), np.float32(0)], + "C": [np.uint8(10.0), np.float32(0)], + "D": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["Conv", "Clip"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 3) + + def test_activation(self): + for op in ["LeakyRelu", "Sigmoid"]: + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 10]) + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 10]) + node = onnx.helper.make_node(op, ["A"], ["B"], name=op) + graph = onnx.helper.make_graph([node], "test_graph_1", [A], [B]) + model = onnx.helper.make_model(graph) + q_config = {op: self.q_config} + quantize_params = {"A": [np.uint8(10.0), np.float32(0)], "B": [np.uint8(10.0), np.float32(0)]} + quantizable_op_types = [op] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2 + ) + + a_value = np.random.randn(1, 10).astype(np.float32) + A_init = onnx.helper.make_tensor("A", onnx.TensorProto.FLOAT, [1, 10], a_value.reshape(10).tolist()) + graph = 
onnx.helper.make_graph([node], "test_graph_1", [A], [B], [A_init]) + model = onnx.helper.make_model(graph) + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2 + ) + + q_model = self.qlinear_test(model, q_config, {}, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qdq_test(model, q_config, {}, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + for op in ["Relu"]: + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 10]) + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 10]) + node = onnx.helper.make_node(op, ["A"], ["B"], name=op) + graph = onnx.helper.make_graph([node], "test_graph_1", [A], [B]) + model = onnx.helper.make_model(graph) + q_config = {op: self.q_config} + quantize_params = {"A": [np.uint8(10.0), np.float32(0)], "B": [np.uint8(10.0), np.float32(0)]} + quantizable_op_types = [op] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + a_value = np.random.randn(1, 10).astype(np.float32) + A_init = onnx.helper.make_tensor("A", onnx.TensorProto.FLOAT, [1, 10], a_value.reshape(10).tolist()) + graph = onnx.helper.make_graph([node], "test_graph_1", [A], [B], [A_init]) + model = onnx.helper.make_model(graph) + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qlinear_test(model, q_config, {}, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qdq_test(model, q_config, {}, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + def test_pooling(self): + op = "MaxPool" + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 5, 5, 1]) + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 5, 5, 1]) + node = onnx.helper.make_node(op, ["A"], ["B"], name=op, kernel_shape=[3, 3], pads=[1, 1, 1, 1]) + graph = onnx.helper.make_graph([node], "test_graph_1", [A], [B]) + q_config = {op: self.q_config} + quantize_params = {"A": [np.uint8(10.0), np.float32(0)], "B": [np.uint8(10.0), np.float32(0)]} + quantizable_op_types = [op] + for opset_version in [12, 13]: + opset = onnx.OperatorSetIdProto() + opset.version = opset_version + 
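# rebuild the standalone MaxPool model for this opset and check that both QOperator and QDQ quantization run without error +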
model = onnx.helper.make_model(graph, opset_imports=[opset]) + self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + conv_node = onnx.helper.make_node( + "Conv", ["A", "B"], ["C"], name="Conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1] + ) + pool_node = onnx.helper.make_node(op, ["C"], ["D"], name=op) + graph = onnx.helper.make_graph([conv_node, pool_node], "test_graph_1", [A, B], [D]) + model = onnx.helper.make_model(graph) + + q_config = {"Conv": self.q_config, op: self.q_config} + quantize_params = { + "A": [np.uint8(10.0), np.float32(0)], + "B": [np.uint8(10.0), np.float32(0)], + "C": [np.uint8(10.0), np.float32(0)], + "D": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["Conv", op] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + + op = "GlobalAveragePool" + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 5, 1, 1]) + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 5, 5, 1]) + node = onnx.helper.make_node(op, ["A"], ["B"], name=op, kernel_shape=[3, 3], pads=[1, 1, 1, 1]) + graph = onnx.helper.make_graph([node], "test_graph_1", [A], [B]) + q_config = {op: self.q_config} + quantize_params = {"A": [np.uint8(10.0), np.float32(0)], "B": [np.uint8(10.0), np.float32(0)]} + quantizable_op_types = [op] + for opset_version in [12, 13]: + opset = onnx.OperatorSetIdProto() + opset.version = opset_version + model = onnx.helper.make_model(graph, opset_imports=[opset]) + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1 + ) + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2 + ) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2 + ) + + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 1, 1, 1]) + conv_node = onnx.helper.make_node( + "Conv", ["A", "B"], ["C"], name="Conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1] + ) + pool_node = onnx.helper.make_node(op, ["C"], ["D"], name=op) + graph = onnx.helper.make_graph([conv_node, pool_node], "test_graph_1", [A, B], 
[D])
+        model = onnx.helper.make_model(graph)
+
+        q_config = {"Conv": self.q_config, op: self.q_config}
+        quantize_params = {
+            "A": [np.uint8(10.0), np.float32(0)],
+            "B": [np.uint8(10.0), np.float32(0)],
+            "C": [np.uint8(10.0), np.float32(0)],
+            "D": [np.uint8(10.0), np.float32(0)],
+        }
+        quantizable_op_types = ["Conv", op]
+        q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types)
+        self.assertEqual(
+            collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1
+        )
+        self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2)
+
+        q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types)
+        self.assertEqual(
+            collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4
+        )
+        self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4)
+
+    def test_exclude_node(self):
+        A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 5, 5, 1])
+        B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [3, 3, 1, 1])
+        D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 1, 3, 3])
+        conv_node = onnx.helper.make_node(
+            "Conv", ["A", "B"], ["C"], name="Conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1]
+        )
+        pool_node = onnx.helper.make_node("MaxPool", ["C"], ["D"], name="MaxPool")
+        graph = onnx.helper.make_graph([conv_node, pool_node], "test_graph_1", [A, B], [D])
+        model = onnx.helper.make_model(graph)
+
+        q_config = {"Conv": self.q_config, "MaxPool": "fp32"}
+        quantize_params = {
+            "A": [np.uint8(10.0), np.float32(0)],
+            "B": [np.uint8(10.0), np.float32(0)],
+            "C": [np.uint8(10.0), np.float32(0)],
+            "D": [np.uint8(10.0), np.float32(0)],
+        }
+        quantizable_op_types = ["Conv", "MaxPool"]
+        q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types)
+        q_model.save("int8.onnx")
+        self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2)
+
+        q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types)
+        self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 3)
+
+    def test_more_direct8bit_nodes(self):
+        # test direct q8 nodes: MatMul-Flatten-Abs-Sign-Shrink-MatMul
+        input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [1, 32])
+
+        matmul1_weight = onnx.helper.make_tensor(
+            "matmul1_weight", onnx.TensorProto.FLOAT, [32, 64], np.random.random((32, 64)).reshape(2048).tolist()
+        )
+        matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [1, 64])
+        matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0")
+
+        flatten_output = onnx.helper.make_tensor_value_info("flatten_output", onnx.TensorProto.FLOAT, [1, 64])
+        flatten_node = onnx.helper.make_node(
+            "Flatten", inputs=["matmul1_output"], outputs=["flatten_output"], axis=1, name="Flatten_1"
+        )
+
+        abs_output = onnx.helper.make_tensor_value_info("abs_output", onnx.TensorProto.FLOAT, [1, 64])
+        abs_node = onnx.helper.make_node("Abs", inputs=["flatten_output"], outputs=["abs_output"], name="Abs_2")
+
+        sign_output = onnx.helper.make_tensor_value_info("sign_output", onnx.TensorProto.FLOAT, [1, 64])
+        sign_node = onnx.helper.make_node("Sign", inputs=["abs_output"], outputs=["sign_output"], name="Sign_3")
+
+        shrink_output =
onnx.helper.make_tensor_value_info("shrink_output", onnx.TensorProto.FLOAT, [1, 64]) + shrink_node = onnx.helper.make_node( + "Shrink", inputs=["sign_output"], outputs=["shrink_output"], name="Shrink_4" + ) + + matmul2_weight = onnx.helper.make_tensor( + "matmul2_weight", onnx.TensorProto.FLOAT, [64, 2], np.random.random((64, 2)).reshape(128).tolist() + ) + matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [1, 2]) + matmul2_node = onnx.helper.make_node( + "MatMul", ["shrink_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_5" + ) + + initializers = [matmul1_weight, matmul2_weight] + graph = onnx.helper.make_graph( + [matmul1_node, flatten_node, abs_node, sign_node, shrink_node, matmul2_node], + "TestMoreDirect8_test_model", + [input_tensor], + [matmul2_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 + + q_config = { + "Matmul_0": self.q_config, + "Flatten_1": self.q_config, + "Abs_2": self.q_config, + "Sign_3": self.q_config, + "Shrink_4": self.q_config, + "Matmul_5": self.q_config, + } + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul1_weight": [np.uint8(10.0), np.float32(0)], + "matmul1_output": [np.uint8(10.0), np.float32(0)], + "flatten_output": [np.uint8(10.0), np.float32(0)], + "abs_output": [np.uint8(10.0), np.float32(0)], + "sign_output": [np.uint8(10.0), np.float32(0)], + "shrink_output": [np.uint8(10.0), np.float32(0)], + "matmul2_weight": [np.uint8(10.0), np.float32(0)], + "matmul2_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul", "Flatten", "Abs", "Sign", "Shrink"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + q_model.save("qdq.onnx") + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 9 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 7) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + def test_expand(self): + # test expand nodes: MatMul-Expand-MatMul + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [3, 2]) + + matmul1_weight = onnx.helper.make_tensor( + "matmul1_weight", onnx.TensorProto.FLOAT, [2, 1], np.random.random((2, 1)).reshape(2).tolist() + ) + matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [3, 1]) + matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0") + + expand_new_shape = onnx.helper.make_tensor("expand_new_shape", onnx.TensorProto.INT64, [2], [3, 4]) + expand_output = onnx.helper.make_tensor_value_info("expand_output", onnx.TensorProto.FLOAT, [3, 4]) + expand_node = onnx.helper.make_node( + "Expand", ["matmul1_output", "expand_new_shape"], ["expand_output"], name="Expand_1" + ) + + matmul2_weight = 
onnx.helper.make_tensor( + "matmul2_weight", onnx.TensorProto.FLOAT, [4, 2], np.random.random((4, 2)).reshape(8).tolist() + ) + matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [3, 2]) + matmul2_node = onnx.helper.make_node( + "MatMul", ["expand_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2" + ) + + initializers = [matmul1_weight, matmul2_weight, expand_new_shape] + graph = onnx.helper.make_graph( + [matmul1_node, expand_node, matmul2_node], + "TestExpand_test_model", + [input_tensor], + [matmul2_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 + + q_config = { + "Matmul_0": self.q_config, + "Expand_1": self.q_config, + "Matmul_2": self.q_config, + } + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul1_weight": [np.uint8(10.0), np.float32(0)], + "matmul1_output": [np.uint8(10.0), np.float32(0)], + "matmul2_weight": [np.uint8(10.0), np.float32(0)], + "matmul2_output": [np.uint8(10.0), np.float32(0)], + "expand_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul", "Expand"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + def test_slice(self): + # test slice nodes: MatMul-Slice-MatMul + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [5, 4, 1]) + + matmul1_weight = onnx.helper.make_tensor( + "matmul1_weight", onnx.TensorProto.FLOAT, [1, 3], np.random.random((1, 3)).reshape(3).tolist() + ) + matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [5, 4, 3]) + matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0") + + slice_starts = onnx.helper.make_tensor("slice_starts", onnx.TensorProto.INT64, [2], [0, 0]) + slice_ends = onnx.helper.make_tensor("slice_ends", onnx.TensorProto.INT64, [2], [3, 4]) + slice_axes = onnx.helper.make_tensor("slice_axes", onnx.TensorProto.INT64, [2], [0, 1]) + slice_steps = onnx.helper.make_tensor("slice_steps", onnx.TensorProto.INT64, [2], [1, 1]) + slice_output = onnx.helper.make_tensor_value_info("slice_output", onnx.TensorProto.FLOAT, [3, 4, 3]) + slice_node = onnx.helper.make_node( + "Slice", + ["matmul1_output", "slice_starts", "slice_ends", "slice_axes", "slice_steps"], + ["slice_output"], + name="Slice_1", + ) + + matmul2_weight = onnx.helper.make_tensor( + "matmul2_weight", onnx.TensorProto.FLOAT, [3, 2], np.random.random((3, 2)).reshape(6).tolist() + ) + matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [3, 4, 2]) + 
matmul2_node = onnx.helper.make_node(
+            "MatMul", ["slice_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2"
+        )
+
+        initializers = [matmul1_weight, matmul2_weight, slice_starts, slice_ends, slice_axes, slice_steps]
+        graph = onnx.helper.make_graph(
+            [matmul1_node, slice_node, matmul2_node],
+            "TestSlice_test_model",
+            [input_tensor],
+            [matmul2_output],
+            initializer=initializers,
+        )
+        model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)])
+        model.ir_version = 7
+
+        q_config = {"Matmul_0": self.q_config, "Slice_1": self.q_config, "Matmul_2": self.q_config}
+        quantize_params = {
+            "input": [np.uint8(10.0), np.float32(0)],
+            "matmul1_weight": [np.uint8(10.0), np.float32(0)],
+            "matmul1_output": [np.uint8(10.0), np.float32(0)],
+            "matmul2_weight": [np.uint8(10.0), np.float32(0)],
+            "matmul2_output": [np.uint8(10.0), np.float32(0)],
+            "slice_output": [np.uint8(10.0), np.float32(0)],
+        }
+        quantizable_op_types = ["MatMul", "Slice"]
+        q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types)
+        self.assertEqual(
+            collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1
+        )
+        self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1)
+        session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"])
+        self.assertIsNotNone(session)
+
+        q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types)
+        self.assertEqual(
+            collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6
+        )
+        self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4)
+        session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"])
+        self.assertIsNotNone(session)
+
+    def test_mod(self):
+        # test Mod nodes: MatMul-Mod-MatMul
+        #                 MatMul-/
+        input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [2, 3])
+
+        matmul1_weight = onnx.helper.make_tensor(
+            "matmul1_weight", onnx.TensorProto.FLOAT, [3, 4], np.random.random((3, 4)).reshape(12).tolist()
+        )
+        matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [2, 4])
+        matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0")
+
+        matmul2_weight = onnx.helper.make_tensor(
+            "matmul2_weight", onnx.TensorProto.FLOAT, [3, 4], np.random.random((3, 4)).reshape(12).tolist()
+        )
+        matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [2, 4])
+        matmul2_node = onnx.helper.make_node("MatMul", ["input", "matmul2_weight"], ["matmul2_output"], name="Matmul_1")
+
+        mod_output = onnx.helper.make_tensor_value_info("mod_output", onnx.TensorProto.FLOAT, [2, 4])
+        mod_node = onnx.helper.make_node("Mod", ["matmul1_output", "matmul2_output"], ["mod_output"], name="Mod_2")
+
+        matmul3_weight = onnx.helper.make_tensor(
+            "matmul3_weight", onnx.TensorProto.FLOAT, [4, 2], np.random.random((4, 2)).reshape(8).tolist()
+        )
+        matmul3_output = onnx.helper.make_tensor_value_info("matmul3_output", onnx.TensorProto.FLOAT, [2, 2])
+        matmul3_node = onnx.helper.make_node(
+            "MatMul", ["mod_output", "matmul3_weight"], ["matmul3_output"], name="Matmul_3"
+        )
+
+        initializers = [matmul1_weight, matmul2_weight, matmul3_weight]
+        graph = onnx.helper.make_graph(
+            [matmul1_node, matmul2_node, mod_node, matmul3_node],
+
"TestMod_test_model", + [input_tensor], + [matmul3_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 14)]) + model.ir_version = 7 + + q_config = { + "Matmul_0": self.q_config, + "Matmul_1": self.q_config, + "Mod_2": self.q_config, + "Matmul_3": self.q_config, + } + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul1_weight": [np.uint8(10.0), np.float32(0)], + "matmul1_output": [np.uint8(10.0), np.float32(0)], + "matmul2_weight": [np.uint8(10.0), np.float32(0)], + "matmul2_output": [np.uint8(10.0), np.float32(0)], + "mod_output": [np.uint8(10.0), np.float32(0)], + "matmul3_weight": [np.uint8(10.0), np.float32(0)], + "matmul3_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul", "Mod"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.save("test.onnx") + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 8 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 5) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + def test_reducemin_reducemax(self): + # MatMul-ReduceMin-MatMul + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [3, 2, 3]) + + matmul1_weight = onnx.helper.make_tensor( + "matmul1_weight", onnx.TensorProto.FLOAT, [3, 2], np.random.random((3, 2)).reshape(6).tolist() + ) + matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [3, 2, 2]) + matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0") + + reducemin_output = onnx.helper.make_tensor_value_info("reducemin_output", onnx.TensorProto.FLOAT, [3, 1, 2]) + reducemin_node = onnx.helper.make_node( + "ReduceMin", + inputs=["matmul1_output"], + outputs=["reducemin_output"], + axes=[1], + keepdims=1, + name="Reducemin_1", + ) + + matmul2_weight = onnx.helper.make_tensor( + "matmul2_weight", onnx.TensorProto.FLOAT, [2, 3], np.random.random((2, 3)).reshape(6).tolist() + ) + matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [3, 1, 3]) + matmul2_node = onnx.helper.make_node( + "MatMul", ["reducemin_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2" + ) + + initializers = [matmul1_weight, matmul2_weight] + graph = onnx.helper.make_graph( + [matmul1_node, reducemin_node, matmul2_node], + "TestReduceMin_test_model", + [input_tensor], + [matmul2_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 + + q_config = { + "Matmul_0": self.q_config, + "Reducemin_1": self.q_config, + "Matmul_2": self.q_config, + } + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul1_weight": [np.uint8(10.0), np.float32(0)], + 
"matmul1_output": [np.uint8(10.0), np.float32(0)], + "reducemin_output": [np.uint8(10.0), np.float32(0)], + "matmul2_weight": [np.uint8(10.0), np.float32(0)], + "matmul2_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul", "ReduceMin"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + # MatMul-ReduceMax-MatMul + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [3, 2, 3]) + + matmul1_weight = onnx.helper.make_tensor( + "matmul1_weight", onnx.TensorProto.FLOAT, [3, 2], np.random.random((3, 2)).reshape(6).tolist() + ) + matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [3, 2, 2]) + matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0") + + reducemax_output = onnx.helper.make_tensor_value_info("reducemax_output", onnx.TensorProto.FLOAT, [3, 1, 2]) + reducemax_node = onnx.helper.make_node( + "ReduceMax", + inputs=["matmul1_output"], + outputs=["reducemax_output"], + axes=[1], + keepdims=1, + name="Reducemax_1", + ) + + matmul2_weight = onnx.helper.make_tensor( + "matmul2_weight", onnx.TensorProto.FLOAT, [2, 3], np.random.random((2, 3)).reshape(6).tolist() + ) + matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [3, 1, 3]) + matmul2_node = onnx.helper.make_node( + "MatMul", ["reducemax_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2" + ) + + initializers = [matmul1_weight, matmul2_weight] + graph = onnx.helper.make_graph( + [matmul1_node, reducemax_node, matmul2_node], + "TestReduceMax_test_model", + [input_tensor], + [matmul2_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 + + q_config = { + "Matmul_0": self.q_config, + "Reducemax_1": self.q_config, + "Matmul_2": self.q_config, + } + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul1_weight": [np.uint8(10.0), np.float32(0)], + "matmul1_output": [np.uint8(10.0), np.float32(0)], + "reducemax_output": [np.uint8(10.0), np.float32(0)], + "matmul2_weight": [np.uint8(10.0), np.float32(0)], + "matmul2_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul", "ReduceMax"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + session = 
ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + def test_tile(self): + # test Tile nodes: MatMul-Tile-MatMul + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [2, 3, 4, 1]) + + matmul1_weight = onnx.helper.make_tensor( + "matmul1_weight", onnx.TensorProto.FLOAT, [1, 5], np.random.random((1, 5)).reshape(5).tolist() + ) + matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [2, 3, 4, 5]) + matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0") + + repeats = onnx.helper.make_tensor("repeats", onnx.TensorProto.INT64, [4], [2, 2, 2, 2]) + tile_output = onnx.helper.make_tensor_value_info("tile_output", onnx.TensorProto.FLOAT, [4, 6, 8, 10]) + tile_node = onnx.helper.make_node( + "Tile", + ["matmul1_output", "repeats"], + ["tile_output"], + name="Tile_1", + ) + + matmul2_weight = onnx.helper.make_tensor( + "matmul2_weight", onnx.TensorProto.FLOAT, [10, 1], np.random.random((10, 1)).reshape(10).tolist() + ) + matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [4, 6, 8, 1]) + matmul2_node = onnx.helper.make_node( + "MatMul", ["tile_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2" + ) + + initializers = [matmul1_weight, matmul2_weight, repeats] + graph = onnx.helper.make_graph( + [matmul1_node, tile_node, matmul2_node], + "TestTile_test_model", + [input_tensor], + [matmul2_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 + + q_config = {"Matmul_0": self.q_config, "Tile_1": self.q_config, "Matmul_2": self.q_config} + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul1_weight": [np.uint8(10.0), np.float32(0)], + "matmul1_output": [np.uint8(10.0), np.float32(0)], + "matmul2_weight": [np.uint8(10.0), np.float32(0)], + "matmul2_output": [np.uint8(10.0), np.float32(0)], + "tile_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul", "Tile"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + 
def test_centercroppad(self): + # test CenterCropPad nodes: MatMul-CenterCropPad-MatMul + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [20, 10, 1]) + + matmul1_weight = onnx.helper.make_tensor( + "matmul1_weight", onnx.TensorProto.FLOAT, [1, 3], np.random.random((1, 3)).reshape(3).tolist() + ) + matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [20, 10, 3]) + matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0") + + centercroppad_output = onnx.helper.make_tensor_value_info( + "centercroppad_output", onnx.TensorProto.FLOAT, [10, 7, 3] + ) + shape = onnx.helper.make_tensor("shape", onnx.TensorProto.INT64, [3], [10, 7, 3]) + centercroppad_node = onnx.helper.make_node( + "CenterCropPad", + ["matmul1_output", "shape"], + ["centercroppad_output"], + name="Centercroppad_1", + ) + + matmul2_weight = onnx.helper.make_tensor( + "matmul2_weight", onnx.TensorProto.FLOAT, [3, 1], np.random.random((3, 1)).reshape(3).tolist() + ) + matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [10, 7, 1]) + matmul2_node = onnx.helper.make_node( + "MatMul", ["centercroppad_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2" + ) + + initializers = [matmul1_weight, shape, matmul2_weight] + graph = onnx.helper.make_graph( + [matmul1_node, centercroppad_node, matmul2_node], + "TestCenterCropPad_test_model", + [input_tensor], + [matmul2_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 18)]) + model.ir_version = 8 + + q_config = { + "Matmul_0": self.q_config, + "Centercroppad_1": self.q_config, + "Matmul_2": self.q_config, + } + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul1_weight": [np.uint8(10.0), np.float32(0)], + "matmul1_output": [np.uint8(10.0), np.float32(0)], + "matmul2_weight": [np.uint8(10.0), np.float32(0)], + "matmul2_output": [np.uint8(10.0), np.float32(0)], + "centercroppad_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul", "CenterCropPad"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + def test_gathernd(self): + # test GatherND nodes: MatMul-GatherND-MatMul + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [2, 2, 1]) + + matmul1_weight = onnx.helper.make_tensor( + "matmul1_weight", onnx.TensorProto.FLOAT, [1, 2], np.random.random((1, 2)).reshape(2).tolist() + ) + matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [2, 2, 2]) + matmul1_node = 
onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0") + + gathernd_output = onnx.helper.make_tensor_value_info("gathernd_output", onnx.TensorProto.FLOAT, [2, 1, 2]) + indices = onnx.helper.make_tensor("indices", onnx.TensorProto.INT64, [2, 1, 2], [0, 1, 1, 0]) + gathernd_node = onnx.helper.make_node( + "GatherND", + ["matmul1_output", "indices"], + ["gathernd_output"], + name="Gathernd_1", + ) + + matmul2_weight = onnx.helper.make_tensor( + "matmul2_weight", onnx.TensorProto.FLOAT, [2, 1], np.random.random((2, 1)).reshape(2).tolist() + ) + matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [2, 1, 1]) + matmul2_node = onnx.helper.make_node( + "MatMul", ["gathernd_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2" + ) + + initializers = [matmul1_weight, indices, matmul2_weight] + graph = onnx.helper.make_graph( + [matmul1_node, gathernd_node, matmul2_node], + "TestGatherND_test_model", + [input_tensor], + [matmul2_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 + + q_config = { + "Matmul_0": self.q_config, + "Matmul_2": self.q_config, + "Gathernd_1": self.q_config, + } + + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul1_weight": [np.uint8(10.0), np.float32(0)], + "matmul1_output": [np.uint8(10.0), np.float32(0)], + "matmul2_weight": [np.uint8(10.0), np.float32(0)], + "matmul2_output": [np.uint8(10.0), np.float32(0)], + "gathernd_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul", "GatherND"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + def test_gatherelements(self): + # test GatherElements nodes: MatMul-GatherElements-MatMul + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [3, 1]) + + matmul1_weight = onnx.helper.make_tensor( + "matmul1_weight", onnx.TensorProto.FLOAT, [1, 3], np.random.random((1, 3)).reshape(3).tolist() + ) + matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [3, 3]) + matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0") + + gatherelements_output = onnx.helper.make_tensor_value_info( + "gatherelements_output", onnx.TensorProto.FLOAT, [2, 3] + ) + indices = onnx.helper.make_tensor("indices", onnx.TensorProto.INT64, [2, 3], [-1, -2, 0, -2, 0, 0]) + gathernd_node = onnx.helper.make_node( + "GatherElements", + ["matmul1_output", "indices"], + ["gatherelements_output"], + name="Gatherelements_1", + ) + + matmul2_weight = 
onnx.helper.make_tensor( + "matmul2_weight", onnx.TensorProto.FLOAT, [3, 1], np.random.random((3, 1)).reshape(3).tolist() + ) + matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [2, 1]) + matmul2_node = onnx.helper.make_node( + "MatMul", ["gatherelements_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2" + ) + + initializers = [matmul1_weight, indices, matmul2_weight] + graph = onnx.helper.make_graph( + [matmul1_node, gathernd_node, matmul2_node], + "TestGatherElements_test_model", + [input_tensor], + [matmul2_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 + + q_config = { + "Matmul_0": self.q_config, + "Matmul_2": self.q_config, + "Gatherelements_1": self.q_config, + } + + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul1_weight": [np.uint8(10.0), np.float32(0)], + "matmul1_output": [np.uint8(10.0), np.float32(0)], + "matmul2_weight": [np.uint8(10.0), np.float32(0)], + "matmul2_output": [np.uint8(10.0), np.float32(0)], + "gatherelements_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul", "GatherElements"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/quantization/post_training_quant/test_post_training_quant.py b/test/quantization/post_training_quant/test_post_training_quant.py new file mode 100644 index 000000000..2720ff69d --- /dev/null +++ b/test/quantization/post_training_quant/test_post_training_quant.py @@ -0,0 +1,203 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import functools +import glob +import os +import shutil +import unittest +from unittest import mock + +import numpy as np +import onnx +import onnxruntime as ort +from optimum.exporters.onnx import main_export + +from onnx_neural_compressor import data_reader, quantization +from onnx_neural_compressor.quantization import config + +from typing import Callable, Dict, List, Optional, Union # isort: skip + + +def fake_eval(model, eval_result_lst): + acc = eval_result_lst.pop(0) + return acc + + +class DataReader(data_reader.CalibrationDataReader): + + def __init__(self, model): + model = onnx.load(model) + batch_size = 1 + sequence_length = 1 + self.data = { + "input_ids": np.random.randint(10, size=(batch_size, sequence_length)).astype("int64"), + "attention_mask": np.zeros((batch_size, sequence_length)).astype("int64"), + } + for inp in model.graph.input: + if inp.name in self.data: + continue + if inp.name == "position_ids": + # model is exported with optimum >= 1.14.0 with new input 'position_ids' + self.data[inp.name] = np.random.randint(10, size=(batch_size, sequence_length)).astype("int64") + + self.enum_data = None + + def get_next(self): + if self.enum_data is None: + self.enum_data = iter([self.data]) + return next(self.enum_data, None) + + def rewind(self): + self.enum_data = None + + +def _count_op_num(model, optype): + num = 0 + for node in model.graph.node: + if node.op_type == optype: + num += 1 + return num + + +class TestPostTrainingQuant(unittest.TestCase): + + @classmethod + def setUpClass(self): + main_export( + "hf-internal-testing/tiny-random-gptj", + output="model", + ) + self.model = glob.glob(os.path.join("./model", "*.onnx"))[0] + self.data_reader = DataReader(self.model) + + @classmethod + def tearDownClass(self): + shutil.rmtree("./model", ignore_errors=True) + os.remove("quant.onnx") + os.remove("quant.onnx_data") + + def test_static_quant(self): + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QInt8, + per_channel=True, + quant_last_matmul=True, + calibrate_method=quantization.CalibrationMethod.Entropy, + extra_options={"WeightSymmetric": True, "ActivationSymmetric": False}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + qmatmul_num_enable_last = _count_op_num(q_model, "QLinearMatMul") + + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QInt8, + calibrate_method=quantization.CalibrationMethod.Percentile, + per_channel=True, + quant_last_matmul=False, + extra_options={"WeightSymmetric": True, "ActivationSymmetric": False}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + node_num_basic = len(q_model.graph.node) + qmatmul_num_disable_last = _count_op_num(q_model, "QLinearMatMul") + + # check quant_last_matmul work + self.assertEqual(qmatmul_num_enable_last, qmatmul_num_disable_last + 1) + + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QUInt8, + per_channel=False, + quant_last_matmul=False, + extra_options={"WeightSymmetric": False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg, ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED) + q_model = onnx.load("quant.onnx") + node_num_extended = 
len(q_model.graph.node) + + # check graph optimization work + self.assertGreater(node_num_basic, node_num_extended) + + # check op_types_to_quantize work + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QUInt8, + per_channel=False, + quant_last_matmul=False, + op_types_to_quantize=["MatMul", "Gather"], + extra_options={"WeightSymmetric": False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + self.assertEqual(_count_op_num(q_model, "QLinearAdd"), 0) + self.assertGreater(_count_op_num(q_model, "QLinearMatMul"), 0) + + # check nodes_to_quantize work + quantizable_matmuls = [i.name.split("_quant")[0] for i in q_model.graph.node if i.op_type == "QLinearMatMul"] + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QUInt8, + nodes_to_quantize=[quantizable_matmuls[0]], + per_channel=False, + quant_last_matmul=False, + op_types_to_quantize=["MatMul", "Gather"], + extra_options={"WeightSymmetric": False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + self.assertEqual(_count_op_num(q_model, "QLinearMatMul"), 1) + + # check nodes_to_exclude work + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QUInt8, + nodes_to_exclude=[quantizable_matmuls[0]], + per_channel=False, + quant_last_matmul=False, + extra_options={"WeightSymmetric": False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + self.assertEqual(_count_op_num(q_model, "QLinearMatMul"), qmatmul_num_disable_last - 1) + + def test_dynamic_quant(self): + cfg = config.DynamicQuantConfig( + weight_type=quantization.QuantType.QInt8, + per_channel=True, + quant_last_matmul=False, + extra_options={"WeightSymmetric": True, "ActivationSymmetric": False}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + + cfg = config.DynamicQuantConfig( + weight_type=quantization.QuantType.QUInt8, + per_channel=False, + quant_last_matmul=False, + extra_options={"WeightSymmetric": False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg, ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/quantization/post_training_quant/test_quant_utils.py b/test/quantization/post_training_quant/test_quant_utils.py new file mode 100644 index 000000000..e98c6104d --- /dev/null +++ b/test/quantization/post_training_quant/test_quant_utils.py @@ -0,0 +1,62 @@ +import unittest + +import numpy as np +import onnx + +from onnx_neural_compressor.algorithms import utility as quant_utils + + +class TestQuantUtility(unittest.TestCase): + + def test_pad_tensor(self): + data = np.random.random((100, 32)) + group_size = 32 + k_blocks = (100 - 1) // 32 + 1 + pad_data = quant_utils.pad_tensor(data, group_size, k_blocks) + self.assertEqual(pad_data.shape, (k_blocks * group_size, 32)) + + def test_quant_dequant_data(self): + data = np.random.random((100, 32)) + qrange = quant_utils.get_qmin_qmax_for_qType( + qType=onnx.TensorProto.UINT8, + 
reduce_range=False, + sym=True, + ) + self.assertEqual(qrange[0], 0) + self.assertEqual(qrange[1], 255) + + rmin = np.min(np.min(data), 0) + rmax = np.max(np.max(data), 0) + + _, _, zero_point, scale, quantized_data = quant_utils.quantize_data( + data=data, + qType=onnx.TensorProto.UINT8, + sym=True, + ) + + dq_data = quant_utils.dequantize_data( + tensor_value=quantized_data, + scale_value=scale, + zo_value=zero_point, + ) + self.assertLess(np.max(np.abs(dq_data - data)), 0.005) + + _, _, zero_point, scale, quantized_data = quant_utils.quantize_data_per_channel( + data=data, + qType=onnx.TensorProto.UINT8, + sym=True, + axis=1, + ) + + dq_data = quant_utils.dequantize_data( + tensor_value=quantized_data, + scale_value=scale, + zo_value=zero_point, + axis=1, + ) + + self.assertLess(np.max(np.abs(dq_data - data)), 0.005) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/quantization/test_autotune.py b/test/quantization/test_autotune.py index 0e86c64b9..dd6ddf0db 100644 --- a/test/quantization/test_autotune.py +++ b/test/quantization/test_autotune.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -# # Copyright (c) 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -26,8 +24,8 @@ import onnxruntime as ort from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, data_reader -from onnx_neural_compressor.quantization import tuning +from onnx_neural_compressor import data_reader, quantization +from onnx_neural_compressor.quantization import config, tuning from typing import Callable, Dict, List, Optional, Union # isort: skip @@ -86,7 +84,7 @@ def setUpClass(self): def tearDownClass(self): shutil.rmtree("./gptj", ignore_errors=True) - @mock.patch("logging.Logger.warning") + @mock.patch("onnx_neural_compressor.logger.warning") def test_auto_tune_warning(self, mock_warning): acc_data = iter([1.0, 0.8, 0.99, 1.0, 0.99, 0.99]) @@ -157,16 +155,20 @@ def eval_fn_wrapper(model): self.assertIsNotNone(best_model) def test_rtn_auto_tune(self): - acc_data = iter([1.0, 0.8, 0.6, 1.0, 0.99, 0.9]) - - def eval_acc_fn(model) -> float: - return next(acc_data) - - perf_data = iter([1.0, 0.99, 0.99]) - - def eval_perf_fn(model) -> float: - return next(perf_data) + eval_acc_fn = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.9]) + with self.assertRaises(SystemExit): + custom_tune_config = tuning.TuningConfig( + config_set=[config.RTNConfig(weight_group_size=32), config.RTNConfig(weight_group_size=64)] + ) + best_model = tuning.autotune( + model_input=self.gptj, + tune_config=custom_tune_config, + eval_fn=eval_acc_fn, + calibration_data_reader=self.data_reader, + ) + eval_perf_fn = functools.partial(fake_eval, eval_result_lst=[1.0, 0.99, 0.99]) + eval_acc_fn = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.99]) eval_fns = [ {"eval_fn": eval_acc_fn, "weight": 0.5, "name": "accuracy"}, { @@ -174,24 +176,12 @@ def eval_perf_fn(model) -> float: "weight": 0.5, }, ] - evaluator = _create_evaluator_for_eval_fns(eval_fns) def eval_fn_wrapper(model): result = evaluator.evaluate(model) return result - custom_tune_config = tuning.TuningConfig( - config_set=[config.RTNConfig(weight_group_size=32), config.RTNConfig(weight_group_size=64)] - ) - best_model = tuning.autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_acc_fn, - calibration_data_reader=self.data_reader, - ) - self.assertIsNone(best_model) - custom_tune_config = 
tuning.TuningConfig(config_set=[config.RTNConfig(weight_group_size=[32, 64])]) best_model = tuning.autotune( model_input=self.gptj, @@ -199,26 +189,32 @@ def eval_fn_wrapper(model): eval_fn=eval_fn_wrapper, calibration_data_reader=self.data_reader, ) + self.assertEqual(len(evaluator.eval_fn_registry), 2) self.assertIsNotNone(best_model) + op_names = [ i.name for i in best_model.graph.node - if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(4, 32)) + if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(4, 64)) ] self.assertTrue(len(op_names) > 0) def test_awq_auto_tune(self): - acc_data = iter([1.0, 0.8, 0.6, 1.0, 0.99, 0.9]) - - def eval_acc_fn(model) -> float: - return next(acc_data) - - perf_data = iter([1.0, 0.99, 0.99]) - - def eval_perf_fn(model) -> float: - return next(perf_data) + eval_acc_fn = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.9]) + with self.assertRaises(SystemExit): + custom_tune_config = tuning.TuningConfig( + config_set=[config.AWQConfig(weight_group_size=32), config.AWQConfig(weight_group_size=64)] + ) + best_model = tuning.autotune( + model_input=self.gptj, + tune_config=custom_tune_config, + eval_fn=eval_acc_fn, + calibration_data_reader=self.data_reader, + ) + eval_perf_fn = functools.partial(fake_eval, eval_result_lst=[1.0, 0.99, 0.99]) + eval_acc_fn = functools.partial(fake_eval, eval_result_lst=[1.0, 0.99, 0.99]) eval_fns = [ {"eval_fn": eval_acc_fn, "weight": 0.5, "name": "accuracy"}, { @@ -226,24 +222,12 @@ def eval_perf_fn(model) -> float: "weight": 0.5, }, ] - evaluator = _create_evaluator_for_eval_fns(eval_fns) def eval_fn_wrapper(model): result = evaluator.evaluate(model) return result - custom_tune_config = tuning.TuningConfig( - config_set=[config.AWQConfig(weight_group_size=32), config.AWQConfig(weight_group_size=64)] - ) - best_model = tuning.autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_acc_fn, - calibration_data_reader=self.data_reader, - ) - self.assertIsNone(best_model) - custom_tune_config = tuning.TuningConfig(config_set=[config.AWQConfig(weight_group_size=[32, 64])]) best_model = tuning.autotune( model_input=self.gptj, @@ -261,16 +245,20 @@ def eval_fn_wrapper(model): self.assertTrue(len(op_names) > 0) def test_gptq_auto_tune(self): - acc_data = iter([1.0, 0.8, 0.6, 1.0, 0.99, 0.9]) - - def eval_acc_fn(model) -> float: - return next(acc_data) - - perf_data = iter([1.0, 0.99, 0.99]) - - def eval_perf_fn(model) -> float: - return next(perf_data) + eval_acc_fn = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.9]) + with self.assertRaises(SystemExit): + custom_tune_config = tuning.TuningConfig( + config_set=[config.GPTQConfig(weight_group_size=32), config.GPTQConfig(weight_group_size=64)] + ) + best_model = tuning.autotune( + model_input=self.gptj, + tune_config=custom_tune_config, + eval_fn=eval_acc_fn, + calibration_data_reader=self.data_reader, + ) + eval_perf_fn = functools.partial(fake_eval, eval_result_lst=[1.0, 0.99, 0.99]) + eval_acc_fn = functools.partial(fake_eval, eval_result_lst=[1.0, 0.99, 0.99]) eval_fns = [ {"eval_fn": eval_acc_fn, "weight": 0.5, "name": "accuracy"}, { @@ -284,17 +272,6 @@ def eval_fn_wrapper(model): result = evaluator.evaluate(model) return result - custom_tune_config = tuning.TuningConfig( - config_set=[config.GPTQConfig(weight_group_size=32), config.GPTQConfig(weight_group_size=64)] - ) - best_model = tuning.autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_acc_fn, - 
calibration_data_reader=self.data_reader, - ) - self.assertIsNone(best_model) - custom_tune_config = tuning.TuningConfig(config_set=[config.GPTQConfig(weight_group_size=[32, 64])]) best_model = tuning.autotune( model_input=self.gptj, @@ -330,7 +307,6 @@ def test_woq_auto_tune(self): if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(8, 32)) ] self.assertTrue(len(op_names) > 0) - partial_fake_eval = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.81, 1.0, 0.99, 0.99]) custom_tune_config = tuning.TuningConfig(config_set=config.get_woq_tuning_config()) @@ -370,6 +346,120 @@ def test_woq_auto_tune(self): ] self.assertTrue(len(op_names) > 0) + def test_dynamic_auto_tune(self): + partial_fake_eval = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.99, 0.81, 1.0, 0.99]) + + custom_tune_config = tuning.TuningConfig(config_set=config.DynamicQuantConfig.get_config_set_for_tuning()) + best_model = tuning.autotune( + model_input=self.gptj, + tune_config=custom_tune_config, + eval_fn=partial_fake_eval, + ) + self.assertIsNotNone(best_model) + + def test_dynamic_custom_auto_tune(self): + partial_fake_eval = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.99]) + custom_tune_config = tuning.TuningConfig( + config_set=config.DynamicQuantConfig( + per_channel=[True, False], + execution_provider="CPUExecutionProvider", + ) + ) + best_model = tuning.autotune( + model_input=self.gptj, + tune_config=custom_tune_config, + eval_fn=partial_fake_eval, + calibration_data_reader=self.data_reader, + ) + + optypes = [i.op_type for i in best_model.graph.node] + self.assertTrue("DynamicQuantizeLinear" in optypes) + self.assertTrue("MatMulInteger" in optypes) + ort.InferenceSession(best_model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(best_model) + + partial_fake_eval = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.82, 0.81, 1.0, 0.99]) + for execution_provider in ["DmlExecutionProvider", "TensorrtExecutionProvider"]: + with self.assertRaises(SystemExit): + custom_tune_config = tuning.TuningConfig( + config_set=config.DynamicQuantConfig( + per_channel=[True, False], + execution_provider=execution_provider, + ) + ) + best_model = tuning.autotune( + model_input=self.gptj, + tune_config=custom_tune_config, + eval_fn=partial_fake_eval, + calibration_data_reader=self.data_reader, + ) + + def test_static_default_auto_tune(self): + partial_fake_eval = functools.partial(fake_eval, eval_result_lst=[1.0, 0.99]) + + custom_tune_config = tuning.TuningConfig( + config_set=config.StaticQuantConfig.get_config_set_for_tuning( + execution_provider="TensorrtExecutionProvider", + quant_format=quantization.QuantFormat.QDQ, + ) + ) + best_model = tuning.autotune( + model_input=self.gptj, + tune_config=custom_tune_config, + eval_fn=partial_fake_eval, + calibration_data_reader=self.data_reader, + ) + optypes = [i.op_type for i in best_model.graph.node] + self.assertTrue("QLinearMatMul" not in optypes) + self.assertTrue("QuantizeLinear" in optypes) + self.assertTrue("MatMul" in optypes) + ort.InferenceSession(best_model.SerializeToString(), providers=["TensorrtExecutionProvider"]) + self.assertIsNotNone(best_model) + + def test_static_custom_auto_tune(self): + partial_fake_eval = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.99]) + + custom_tune_config = tuning.TuningConfig( + config_set=config.StaticQuantConfig( + per_channel=[True, False], + execution_provider="CPUExecutionProvider", + 
quant_format=quantization.QuantFormat.QOperator, + ) + ) + best_model = tuning.autotune( + model_input=self.gptj, + tune_config=custom_tune_config, + eval_fn=partial_fake_eval, + calibration_data_reader=self.data_reader, + ) + + optypes = [i.op_type for i in best_model.graph.node] + self.assertTrue("QLinearMatMul" in optypes) + self.assertTrue("QuantizeLinear" in optypes) + ort.InferenceSession(best_model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(best_model) + + @mock.patch("onnx_neural_compressor.logger.warning") + def test_skip_verified_config_mapping(self, mock_warning): + partial_fake_eval = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.99]) + + with self.assertRaises(SystemExit): + custom_tune_config = tuning.TuningConfig( + config_set=config.StaticQuantConfig( + per_channel=[True, False], + execution_provider="DmlExecutionProvider", + ) + ) + best_model = tuning.autotune( + model_input=self.gptj, + tune_config=custom_tune_config, + eval_fn=partial_fake_eval, + calibration_data_reader=self.data_reader, + ) + call_args_list = mock_warning.call_args_list + # There may be multiple calls to warning, so we need to check all of them + self.assertIn("Skip the verified config mapping.", [info[0][0] for info in call_args_list]) + if __name__ == "__main__": unittest.main() diff --git a/test/quantization/test_config.py b/test/quantization/test_config.py index 50ffc74d0..39c09bbf0 100644 --- a/test/quantization/test_config.py +++ b/test/quantization/test_config.py @@ -7,8 +7,9 @@ import onnx from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, logger, utility +from onnx_neural_compressor import logger, quantization, utility from onnx_neural_compressor.quantization import algorithm_entry as algos +from onnx_neural_compressor.quantization import config, tuning def find_onnx_file(folder_path): @@ -83,6 +84,243 @@ def _count_woq_matmul(self, q_model, bits=4, group_size=32): ] return len(op_names) + def test_dynamic_quant_config(self): + for execution_provider in ["CPUExecutionProvider", "CUDAExecutionProvider", "DnnlExecutionProvider"]: + tuning_config = tuning.TuningConfig( + config_set=config.DynamicQuantConfig.get_config_set_for_tuning( + execution_provider=execution_provider, + ) + ) + config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler) + for idx, quant_config in enumerate(config_loader): + model_info = quant_config.get_model_info(model=self.simple_onnx_model) + configs_mapping = quant_config.to_config_mapping(model_info=model_info) + if idx == 0: + self.assertTrue(configs_mapping["Matmul"]["per_channel"]) + elif idx == 1: + self.assertFalse(configs_mapping["Matmul"]["per_channel"]) + if 3 < idx < 8: + self.assertTrue("LSTM" not in quant_config.op_types_to_quantize) + elif 7 < idx < 12: + self.assertTrue("Conv" not in quant_config.op_types_to_quantize) + elif 11 < idx < 16: + self.assertTrue("Attention" not in quant_config.op_types_to_quantize) + elif 15 < idx < 20: + self.assertTrue("MatMul" not in quant_config.op_types_to_quantize) + self.assertLess(idx, 20) + self.assertTrue("add" not in configs_mapping and "add2" not in configs_mapping) + + for execution_provider in ["DmlExecutionProvider", "TensorrtExecutionProvider"]: + tuning_config = tuning.TuningConfig( + config_set=config.DynamicQuantConfig.get_config_set_for_tuning( + execution_provider=execution_provider, + ) + ) + config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, 
sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                self.assertTrue("add" not in configs_mapping)
+                self.assertTrue("add2" not in configs_mapping)
+                self.assertTrue("Matmul" not in configs_mapping)
+
+        self.assertEqual(len(config_loader.config_set), 20)
+
+    def test_dynamic_custom_quant_config(self):
+        for execution_provider in ["CPUExecutionProvider", "CUDAExecutionProvider", "DnnlExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.DynamicQuantConfig(
+                    per_channel=[True, False],
+                    execution_provider=execution_provider,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                if idx == 0:
+                    self.assertTrue(configs_mapping["Matmul"]["per_channel"])
+                elif idx == 1:
+                    self.assertFalse(configs_mapping["Matmul"]["per_channel"])
+                self.assertLess(idx, 2)
+                self.assertTrue("add" not in configs_mapping and "add2" not in configs_mapping)
+
+        for execution_provider in ["DmlExecutionProvider", "TensorrtExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.DynamicQuantConfig(
+                    per_channel=[True, False],
+                    execution_provider=execution_provider,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                self.assertTrue("add" not in configs_mapping)
+                self.assertTrue("add2" not in configs_mapping)
+                self.assertTrue("Matmul" not in configs_mapping)
+                self.assertLess(idx, 4)
+
+        self.assertEqual(len(config_loader.config_set), 2)
+
+    def test_static_quant_config(self):
+        for execution_provider in ["CPUExecutionProvider", "CUDAExecutionProvider", "DnnlExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.StaticQuantConfig.get_config_set_for_tuning(
+                    execution_provider=execution_provider,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                if idx in [0, 4]:
+                    self.assertTrue(configs_mapping["Matmul"]["per_channel"])
+                elif idx in [1, 5]:
+                    self.assertFalse(configs_mapping["Matmul"]["per_channel"])
+                if idx < 4:
+                    self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
+                else:
+                    self.assertFalse("add" in configs_mapping)
+                if idx in [0, 1]:
+                    self.assertEqual(configs_mapping["Matmul"]["calibrate_method"], "MinMax")
+                self.assertLess(idx, 16)
+
+        for execution_provider in ["TensorrtExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.StaticQuantConfig.get_config_set_for_tuning(
+                    execution_provider=execution_provider,
+                    quant_format=quantization.QuantFormat.QOperator,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                self.assertTrue("add" not in configs_mapping)
+                self.assertTrue("add2" not in configs_mapping)
+                self.assertTrue("Matmul" not in configs_mapping)
+
+        self.assertEqual(len(config_loader.config_set), 16)
+
+        for execution_provider in ["DmlExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.StaticQuantConfig.get_config_set_for_tuning(
+                    execution_provider=execution_provider,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                if "Matmul" in configs_mapping:
+                    self.assertFalse(configs_mapping["Matmul"]["per_channel"])
+                    self.assertEqual(configs_mapping["Matmul"]["calibrate_method"], "MinMax")
+                if "add" in configs_mapping:
+                    self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
+                self.assertLess(idx, 16)
+
+        for execution_provider in ["TensorrtExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.StaticQuantConfig.get_config_set_for_tuning(
+                    execution_provider=execution_provider,
+                    quant_format=quantization.QuantFormat.QDQ,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                if idx in [0, 4]:
+                    self.assertTrue(configs_mapping["Matmul"]["per_channel"])
+                elif idx in [1, 5]:
+                    self.assertFalse(configs_mapping["Matmul"]["per_channel"])
+                if "add" in configs_mapping:
+                    self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
+                    self.assertTrue(configs_mapping["add"]["weight_sym"])
+                    self.assertTrue(configs_mapping["add"]["activation_sym"])
+                if "Matmul" in configs_mapping:
+                    self.assertTrue(configs_mapping["Matmul"]["weight_sym"])
+                    self.assertTrue(configs_mapping["Matmul"]["activation_sym"])
+                self.assertLess(idx, 16)
+
+    def test_static_custom_quant_config(self):
+        for execution_provider in ["CPUExecutionProvider", "CUDAExecutionProvider", "DnnlExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.StaticQuantConfig(
+                    per_channel=[True, False],
+                    execution_provider=execution_provider,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                if idx == 0:
+                    self.assertTrue(configs_mapping["Matmul"]["per_channel"])
+                elif idx == 1:
+                    self.assertFalse(configs_mapping["Matmul"]["per_channel"])
+                self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
+
+            self.assertLess(idx, 2)
+
+        for execution_provider in ["TensorrtExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.StaticQuantConfig(
+                    per_channel=[True, False],
+                    execution_provider=execution_provider,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                self.assertTrue("add" not in configs_mapping)
+                self.assertTrue("add2" not in configs_mapping)
+                self.assertTrue("Matmul" not in configs_mapping)
+
+            # no op-level quant config entries for this EP; the config set itself still holds 2 configs
+            self.assertEqual(len(config_loader.config_set), 2)
+
+        for execution_provider in ["DmlExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.StaticQuantConfig(
+                    per_channel=[True, False],
+                    execution_provider=execution_provider,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                self.assertFalse(configs_mapping["Matmul"]["per_channel"])
+                self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
+                self.assertLess(idx, 4)
+
+        for execution_provider in ["TensorrtExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.StaticQuantConfig(
+                    per_channel=[True, False],
+                    execution_provider=execution_provider,
+                    quant_format=quantization.QuantFormat.QDQ,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                if idx == 0:
+                    self.assertTrue(configs_mapping["Matmul"]["per_channel"])
+                elif idx == 1:
+                    self.assertFalse(configs_mapping["Matmul"]["per_channel"])
+                self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
+                self.assertTrue(configs_mapping["add"]["weight_sym"])
+                self.assertTrue(configs_mapping["add"]["activation_sym"])
+                self.assertTrue(configs_mapping["Matmul"]["weight_sym"])
+                self.assertTrue(configs_mapping["Matmul"]["activation_sym"])
+                self.assertLess(idx, 2)
+
     def test_config_white_lst(self):
         global_config = config.RTNConfig(weight_bits=4)
         # set operator instance
@@ -113,12 +351,12 @@ def test_config_white_lst3(self):
         quant_config = global_config + fc_out_config
         # get model and quantize
         fp32_model = self.gptj
-        model_info = utility.get_model_info(fp32_model, white_op_type_list=["MatMul"])
+        model_info = config.RTNConfig.get_model_info(fp32_model)
         logger.info(quant_config)
         configs_mapping = quant_config.to_config_mapping(model_info=model_info)
         logger.info(configs_mapping)
-        self.assertTrue(configs_mapping[("/h.4/mlp/fc_out/MatMul", "MatMul")].weight_bits == 8)
-        self.assertTrue(configs_mapping[("/h.4/mlp/fc_in/MatMul", "MatMul")].weight_bits == 4)
+        self.assertTrue(configs_mapping["/h.4/mlp/fc_out/MatMul"]["weight_bits"] == 8)
+        self.assertTrue(configs_mapping["/h.4/mlp/fc_in/MatMul"]["weight_bits"] == 4)

     def test_config_from_dict(self):
         quant_config = {
@@ -170,6 +408,7 @@ def test_same_type_configs_addition(self):
                 },
             }
         }
+        q_config2 = config.RTNConfig.from_dict(quant_config2["rtn"])
         q_config3 = q_config + q_config2
         q3_dict = q_config3.to_dict()
@@ -185,21 +424,21 @@ def test_config_mapping(self):
         quant_config.set_local("/h.4/mlp/fc_out/MatMul", fc_out_config)
         # get model and quantize
         fp32_model = self.gptj
-        model_info = utility.get_model_info(fp32_model, white_op_type_list=["MatMul"])
+        model_info = config.RTNConfig.get_model_info(fp32_model)
         logger.info(quant_config)
         configs_mapping = quant_config.to_config_mapping(model_info=model_info)
         logger.info(configs_mapping)
-        self.assertTrue(configs_mapping[("/h.4/mlp/fc_out/MatMul", "MatMul")].weight_bits == 8)
-        self.assertTrue(configs_mapping[("/h.4/mlp/fc_in/MatMul", "MatMul")].weight_bits == 4)
+        self.assertTrue(configs_mapping["/h.4/mlp/fc_out/MatMul"]["weight_bits"] == 8)
+        self.assertTrue(configs_mapping["/h.4/mlp/fc_in/MatMul"]["weight_bits"] == 4)
         # test regular matching
         fc_config = config.RTNConfig(weight_bits=3)
         quant_config.set_local("/h.[1-4]/mlp/fc_out/MatMul", fc_config)
         configs_mapping = quant_config.to_config_mapping(model_info=model_info)
         logger.info(configs_mapping)
-        self.assertTrue(configs_mapping[("/h.4/mlp/fc_out/MatMul", "MatMul")].weight_bits == 3)
-        self.assertTrue(configs_mapping[("/h.3/mlp/fc_out/MatMul", "MatMul")].weight_bits == 3)
-        self.assertTrue(configs_mapping[("/h.2/mlp/fc_out/MatMul", "MatMul")].weight_bits == 3)
-        self.assertTrue(configs_mapping[("/h.1/mlp/fc_out/MatMul", "MatMul")].weight_bits == 3)
+        self.assertTrue(configs_mapping["/h.4/mlp/fc_out/MatMul"]["weight_bits"] == 3)
+        self.assertTrue(configs_mapping["/h.3/mlp/fc_out/MatMul"]["weight_bits"] == 3)
+        self.assertTrue(configs_mapping["/h.2/mlp/fc_out/MatMul"]["weight_bits"] == 3)
+        self.assertTrue(configs_mapping["/h.1/mlp/fc_out/MatMul"]["weight_bits"] == 3)

     def test_diff_types_configs_addition(self):
         quant_config1 = {
@@ -219,12 +458,12 @@ class TestQuantConfigForAutotune(unittest.TestCase):

-    def test_expand_config(self):
+    def test_expand_woq_config(self):
         # test the expand functionalities, the user is not aware it
         tune_config = config.RTNConfig(weight_bits=[4, 8])
         expand_config_list = config.RTNConfig.expand(tune_config)
-        self.assertEqual(expand_config_list[0].weight_bits, 4)
-        self.assertEqual(expand_config_list[1].weight_bits, 8)
+        self.assertEqual(expand_config_list[0]["weight_bits"], 4)
+        self.assertEqual(expand_config_list[1]["weight_bits"], 8)


 if __name__ == "__main__":
diff --git a/test/quantization/test_smooth_quant.py b/test/quantization/test_smooth_quant.py
index fed59e142..242417e6e 100644
--- a/test/quantization/test_smooth_quant.py
+++ b/test/quantization/test_smooth_quant.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-#
 # Copyright (c) 2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -21,12 +19,13 @@
 import numpy as np
 import onnx
+import onnxruntime as ort
 from optimum.exporters.onnx import main_export

-from onnx_neural_compressor import config, data_reader
+from onnx_neural_compressor import data_reader
 from onnx_neural_compressor.quantization import QuantType
 from onnx_neural_compressor.quantization import algorithm_entry as algos
-from onnx_neural_compressor.quantization import quantize
+from onnx_neural_compressor.quantization import config, quantize


 class DataReader(data_reader.CalibrationDataReader):
@@ -72,6 +71,7 @@ def setUpClass(self):
     @classmethod
     def tearDownClass(self):
         shutil.rmtree("./gptj", ignore_errors=True)
+        os.remove("Optimized_model.onnx")

     def test_sq_from_class_beginner(self):
         self.data_reader.rewind()
@@ -111,6 +111,43 @@ def test_sq_with_ort_like_api(self):
         self.assertTrue(3 not in [i.data_type for i in model.graph.initializer])
         self.assertEqual(num_muls, 30)
+
+    def test_smooth_quant_args(self):
+        self.data_reader.rewind()
+        sq_config = config.SmoothQuantConfig(
+            weight_type=QuantType.QUInt8, activation_type=QuantType.QUInt8, alpha="auto"
+        )
+        model = algos.smooth_quant_entry(self.gptj, sq_config, self.data_reader)
+        num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"])
+        self.assertEqual(num_muls, 30)
+
+        self.data_reader.rewind()
+        sq_config = config.SmoothQuantConfig(
+            weight_type=QuantType.QUInt8, activation_type=QuantType.QUInt8, scales_per_op=False
+        )
+        model = algos.smooth_quant_entry(self.gptj, sq_config, self.data_reader)
+        num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"])
+        self.assertEqual(num_muls, 15)
+
+        sess_options = ort.SessionOptions()
+        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
+        sess_options.optimized_model_filepath = "Optimized_model.onnx"
+        sess = ort.InferenceSession(self.gptj, sess_options, providers=["CPUExecutionProvider"])
+        self.data_reader.rewind()
+        sq_config = config.SmoothQuantConfig(
+            weight_type=QuantType.QUInt8, activation_type=QuantType.QUInt8, folding=True, scales_per_op=False
+        )
+        model = algos.smooth_quant_entry("Optimized_model.onnx", sq_config, self.data_reader)
+        num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"])
+        self.assertEqual(num_muls, 10)
+
+        self.data_reader.rewind()
+        sq_config = config.SmoothQuantConfig(
+            weight_type=QuantType.QUInt8, activation_type=QuantType.QUInt8, folding=False, scales_per_op=False
+        )
+        model = algos.smooth_quant_entry("Optimized_model.onnx", sq_config, self.data_reader)
+        num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"])
+        self.assertEqual(num_muls, 15)
+

 if __name__ == "__main__":
     unittest.main()
diff --git a/test/quantization/weight_only/test_awq.py b/test/quantization/weight_only/test_awq.py
index 2d918cc61..e1c23d495 100644
--- a/test/quantization/weight_only/test_awq.py
+++ b/test/quantization/weight_only/test_awq.py
@@ -8,9 +8,9 @@
 import transformers
 from optimum.exporters.onnx import main_export

-from onnx_neural_compressor import config, data_reader, logger
+from onnx_neural_compressor import data_reader, logger
 from onnx_neural_compressor.quantization import algorithm_entry as algos
-from onnx_neural_compressor.quantization import matmul_4bits_quantizer, matmul_nbits_quantizer
+from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer, matmul_nbits_quantizer


 def find_onnx_file(folder_path):
diff --git a/test/quantization/weight_only/test_gptq.py b/test/quantization/weight_only/test_gptq.py
index 133e11fd1..1e674b7dd 100644
--- a/test/quantization/weight_only/test_gptq.py
+++ b/test/quantization/weight_only/test_gptq.py
@@ -8,9 +8,9 @@
 import transformers
 from optimum.exporters.onnx import main_export

-from onnx_neural_compressor import config, data_reader, logger
+from onnx_neural_compressor import data_reader, logger
 from onnx_neural_compressor.quantization import algorithm_entry as algos
-from onnx_neural_compressor.quantization import matmul_4bits_quantizer, matmul_nbits_quantizer
+from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer, matmul_nbits_quantizer


 def find_onnx_file(folder_path):
diff --git a/test/quantization/weight_only/test_rtn.py b/test/quantization/weight_only/test_rtn.py
index 86b3c49a3..aa3672d0c 100644
--- a/test/quantization/weight_only/test_rtn.py
+++ b/test/quantization/weight_only/test_rtn.py
@@ -6,9 +6,9 @@

 from optimum.exporters.onnx import main_export

-from onnx_neural_compressor import config, logger
+from onnx_neural_compressor import logger
 from onnx_neural_compressor.quantization import algorithm_entry as algos
-from onnx_neural_compressor.quantization import matmul_4bits_quantizer, matmul_nbits_quantizer
+from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer, matmul_nbits_quantizer


 def find_onnx_file(folder_path):
diff --git a/test/utils/test_general.py b/test/utils/test_general.py
index d24392438..b07d73115 100644
--- a/test/utils/test_general.py
+++ b/test/utils/test_general.py
@@ -2,8 +2,8 @@

 import unittest

-from onnx_neural_compressor import config, constants, logger
-from onnx_neural_compressor.quantization import tuning
+from onnx_neural_compressor import constants, logger
+from onnx_neural_compressor.quantization import config, tuning

 from typing import Any, Callable, List, Optional, Tuple, Union  # isort: skip
@@ -192,7 +192,10 @@ def test_api(self):
         self.assertEqual(fake_default_config.weight_dtype, "int")
         config_set = get_all_config_set()
         self.assertEqual(len(config_set), len(config.config_registry.get_all_config_cls()))
-        self.assertEqual([i for i in config_set if i.name == FAKE_CONFIG_NAME][0].weight_bits, DEFAULT_WEIGHT_BITS)
+        self.assertEqual(
+            [i for i in config_set if getattr(i, "name", "None") == FAKE_CONFIG_NAME][0].weight_bits,
+            DEFAULT_WEIGHT_BITS,
+        )

     def test_config_expand_complex_tunable_type(self):
         target_op_type_list_options = [["Conv", "Gemm"], ["Conv", "Matmul"]]
@@ -211,8 +214,98 @@ def test_mixed_two_algos(self):
         mixed_config = fake_config + fake1_config
         model_info = mixed_config.get_model_info(model)
         config_mapping = mixed_config.to_config_mapping(model_info=model_info)
-        self.assertIn(OP1_NAME, [op_info[0] for op_info in config_mapping])
-        self.assertIn(OP2_NAME, [op_info[0] for op_info in config_mapping])
+        self.assertIn(OP1_NAME, config_mapping)
+        self.assertIn(OP2_NAME, config_mapping)
+
+    def test_config_expand(self) -> None:
+        cfg = config.RTNConfig(
+            weight_bits=[4, 8], weight_sym=[True, False], layer_wise_quant=[True, False], providers=[["CPU"], ["CUDA"]]
+        )
+        expand_cfgs = cfg.expand()
+        self.assertEqual(expand_cfgs[0].weight_bits, 4)
+        self.assertEqual(expand_cfgs[0].weight_sym, True)
+        self.assertEqual(expand_cfgs[0].layer_wise_quant, True)
+        self.assertEqual(expand_cfgs[0].providers, ["CPU"])
+
+        self.assertEqual(expand_cfgs[1].weight_bits, 8)
+        self.assertEqual(expand_cfgs[1].weight_sym, True)
+        self.assertEqual(expand_cfgs[1].layer_wise_quant, True)
+        self.assertEqual(expand_cfgs[1].providers, ["CPU"])
+
+        self.assertEqual(expand_cfgs[2].weight_bits, 4)
+        self.assertEqual(expand_cfgs[2].weight_sym, False)
+        self.assertEqual(expand_cfgs[2].layer_wise_quant, True)
+        self.assertEqual(expand_cfgs[2].providers, ["CPU"])
+
+        self.assertEqual(expand_cfgs[3].weight_bits, 8)
+        self.assertEqual(expand_cfgs[3].weight_sym, False)
+        self.assertEqual(expand_cfgs[3].layer_wise_quant, True)
+        self.assertEqual(expand_cfgs[3].providers, ["CPU"])
+
+        self.assertEqual(expand_cfgs[4].weight_bits, 4)
+        self.assertEqual(expand_cfgs[4].weight_sym, True)
+        self.assertEqual(expand_cfgs[4].layer_wise_quant, True)
+        self.assertEqual(expand_cfgs[4].providers, ["CUDA"])
+
+        self.assertEqual(expand_cfgs[5].weight_bits, 8)
+        self.assertEqual(expand_cfgs[5].weight_sym, True)
+        self.assertEqual(expand_cfgs[5].layer_wise_quant, True)
+        self.assertEqual(expand_cfgs[5].providers, ["CUDA"])
+
+        self.assertEqual(expand_cfgs[6].weight_bits, 4)
+        self.assertEqual(expand_cfgs[6].weight_sym, False)
+        self.assertEqual(expand_cfgs[6].layer_wise_quant, True)
+        self.assertEqual(expand_cfgs[6].providers, ["CUDA"])
+
+        self.assertEqual(expand_cfgs[7].weight_bits, 8)
+        self.assertEqual(expand_cfgs[7].weight_sym, False)
+        self.assertEqual(expand_cfgs[7].layer_wise_quant, True)
+        self.assertEqual(expand_cfgs[7].providers, ["CUDA"])
+
+        self.assertEqual(expand_cfgs[8].weight_bits, 4)
+        self.assertEqual(expand_cfgs[8].weight_sym, True)
+        self.assertEqual(expand_cfgs[8].layer_wise_quant, False)
+        self.assertEqual(expand_cfgs[8].providers, ["CPU"])
+
+        self.assertEqual(expand_cfgs[9].weight_bits, 8)
+        self.assertEqual(expand_cfgs[9].weight_sym, True)
+        self.assertEqual(expand_cfgs[9].layer_wise_quant, False)
+        self.assertEqual(expand_cfgs[9].providers, ["CPU"])
+
+        self.assertEqual(expand_cfgs[10].weight_bits, 4)
+        self.assertEqual(expand_cfgs[10].weight_sym, False)
+        self.assertEqual(expand_cfgs[10].layer_wise_quant, False)
+        self.assertEqual(expand_cfgs[10].providers, ["CPU"])
+
+        self.assertEqual(expand_cfgs[11].weight_bits, 8)
+        self.assertEqual(expand_cfgs[11].weight_sym, False)
+        self.assertEqual(expand_cfgs[11].layer_wise_quant, False)
+        self.assertEqual(expand_cfgs[11].providers, ["CPU"])
+
+        self.assertEqual(expand_cfgs[12].weight_bits, 4)
+        self.assertEqual(expand_cfgs[12].weight_sym, True)
+        self.assertEqual(expand_cfgs[12].layer_wise_quant, False)
+        self.assertEqual(expand_cfgs[12].providers, ["CUDA"])
+
+        self.assertEqual(expand_cfgs[13].weight_bits, 8)
+        self.assertEqual(expand_cfgs[13].weight_sym, True)
+        self.assertEqual(expand_cfgs[13].layer_wise_quant, False)
+        self.assertEqual(expand_cfgs[13].providers, ["CUDA"])
+
+        self.assertEqual(expand_cfgs[14].weight_bits, 4)
+        self.assertEqual(expand_cfgs[14].weight_sym, False)
+        self.assertEqual(expand_cfgs[14].layer_wise_quant, False)
+        self.assertEqual(expand_cfgs[14].providers, ["CUDA"])
+
+        self.assertEqual(expand_cfgs[15].weight_bits, 8)
+        self.assertEqual(expand_cfgs[15].weight_sym, False)
+        self.assertEqual(expand_cfgs[15].layer_wise_quant, False)
+        self.assertEqual(expand_cfgs[15].providers, ["CUDA"])
+
+    def test_config_expand_with_empty_options(self):
+        configs = FakeAlgoConfig(weight_dtype=["int", "float32"], weight_bits=[])
+        configs_list = configs.expand()
+        self.assertEqual(len(configs_list), 2)


 class TestConfigSet(unittest.TestCase):
@@ -247,6 +340,14 @@ def test_config_loader(self) -> None:
         for i, cfg in enumerate(self.loader):
             self.assertEqual(cfg, self.config_set[i])

+    def test_config_loader_skip_verified_config(self) -> None:
+        config_set = [FakeAlgoConfig(weight_bits=[4, 8]), FakeAlgoConfig(weight_bits=8)]
+        config_loader = tuning.ConfigLoader(config_set)
+        config_count = 0
+        for i, _ in enumerate(config_loader):
+            config_count += 1
+        self.assertEqual(config_count, 2)
+

 if __name__ == "__main__":
     unittest.main()
diff --git a/test/utils/test_param.py b/test/utils/test_param.py
index fd8b7d3d3..549e1fb47 100644
--- a/test/utils/test_param.py
+++ b/test/utils/test_param.py
@@ -3,7 +3,7 @@
 import unittest
 from typing import List

-from onnx_neural_compressor import config
+from onnx_neural_compressor.quantization import config


 class TestTuningParam(unittest.TestCase):
@@ -20,6 +20,9 @@ def test_is_tunable_recursive(self):
         self.assertTrue(param.is_tunable([[5, 6], [7, 8]]))
         # TODO: double check if this is the expected behavior
         self.assertTrue(param.is_tunable([[5, 6], [7, "8"]]))
+        self.assertEqual(
+            str(param), "TuningParam(name=param_name, tunable_type=typing.List[typing.List[int]], options=None)."
+        )


 if __name__ == "__main__":
diff --git a/test/utils/test_utility.py b/test/utils/test_utility.py
index fa7a4812f..50ce620b9 100644
--- a/test/utils/test_utility.py
+++ b/test/utils/test_utility.py
@@ -17,26 +17,6 @@ def test_set_random_seed(self):
         with self.assertRaises(AssertionError):
             utility.set_random_seed(seed)

-    def test_set_workspace(self):
-        workspace = "/path/to/workspace"
-        utility.set_workspace(workspace)
-        self.assertEqual(utility.options.workspace, workspace)
-
-        # non String type
-        workspace = 12345
-        with self.assertRaises(AssertionError):
-            utility.set_workspace(workspace)
-
-    def test_set_resume_from(self):
-        resume_from = "/path/to/resume"
-        utility.set_resume_from(resume_from)
-        self.assertEqual(utility.options.resume_from, resume_from)
-
-        # non String type
-        resume_from = 12345
-        with self.assertRaises(AssertionError):
-            utility.set_resume_from(resume_from)
-


 class TestCPUInfo(unittest.TestCase):