From 49aeb9cbeef4352c3bbb61c88028612880c28afe Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Fri, 28 Jun 2024 14:03:21 +0800 Subject: [PATCH] Add W8A8 quant and examples (#24) Signed-off-by: Mengni Wang --- examples/.config/model_params_onnxrt.json | 30 +- .../quantization/ptq_static/README.md | 63 + .../resnet50/quantization/ptq_static/main.py | 277 ++ .../quantization/ptq_static/prepare_model.py | 57 + .../quantization/ptq_static/requirements.txt | 6 + .../quantization/ptq_static/run_benchmark.sh | 49 + .../quantization/ptq_static/run_quant.sh | 47 + .../bert/quantization/ptq_dynamic/README.md | 51 + .../nlp/bert/quantization/ptq_dynamic/main.py | 442 ++++ .../quantization/ptq_dynamic/prepare_data.sh | 34 + .../quantization/ptq_dynamic/prepare_model.py | 118 + .../quantization/ptq_dynamic/requirements.txt | 8 + .../quantization/ptq_dynamic/run_benchmark.sh | 64 + .../quantization/ptq_dynamic/run_quant.sh | 47 + .../bert/quantization/ptq_static/README.md | 60 + .../nlp/bert/quantization/ptq_static/main.py | 509 ++++ .../quantization/ptq_static/prepare_data.sh | 34 + .../quantization/ptq_static/prepare_model.py | 118 + .../quantization/ptq_static/requirements.txt | 8 + .../quantization/ptq_static/run_benchmark.sh | 64 + .../bert/quantization/ptq_static/run_quant.sh | 53 + .../llama/quantization/weight_only/main.py | 26 +- onnx_neural_compressor/__init__.py | 2 - .../algorithms/layer_wise/core.py | 17 +- .../post_training_quant/__init__.py | 13 + .../post_training_quant/calibrate.py | 637 +++++ .../post_training_quant/calibrator.py | 401 +++ .../post_training_quant/operators/__init__.py | 27 + .../operators/activation.py | 112 + .../post_training_quant/operators/argmax.py | 40 + .../operators/attention.py | 71 + .../post_training_quant/operators/base_op.py | 92 + .../operators/binary_op.py | 150 ++ .../post_training_quant/operators/concat.py | 125 + .../post_training_quant/operators/conv.py | 201 ++ .../operators/direct_q8.py | 78 + .../operators/embed_layernorm.py | 68 + .../post_training_quant/operators/gather.py | 109 + .../post_training_quant/operators/gavgpool.py | 59 + .../post_training_quant/operators/gemm.py | 91 + .../post_training_quant/operators/lstm.py | 138 + .../post_training_quant/operators/matmul.py | 168 ++ .../post_training_quant/operators/maxpool.py | 74 + .../post_training_quant/operators/pad.py | 102 + .../post_training_quant/operators/pooling.py | 81 + .../post_training_quant/operators/reduce.py | 83 + .../post_training_quant/operators/resize.py | 75 + .../post_training_quant/operators/split.py | 88 + .../post_training_quant/operators/unary_op.py | 80 + .../post_training_quant/quantizer.py | 1246 +++++++++ .../algorithms/smoother/core.py | 59 +- onnx_neural_compressor/algorithms/utility.py | 702 +++++ .../algorithms/weight_only/awq.py | 150 +- .../algorithms/weight_only/gptq.py | 105 +- .../algorithms/weight_only/rtn.py | 79 +- .../algorithms/weight_only/utility.py | 332 --- onnx_neural_compressor/config.py | 1239 --------- onnx_neural_compressor/constants.py | 279 +- onnx_neural_compressor/data_reader.py | 27 +- onnx_neural_compressor/onnx_model.py | 289 ++- .../quantization/__init__.py | 4 +- .../quantization/algorithm_entry.py | 246 +- .../quantization/calibrate.py | 32 - onnx_neural_compressor/quantization/config.py | 2249 +++++++++++++++++ .../quantization/matmul_4bits_quantizer.py | 6 +- .../quantization/matmul_nbits_quantizer.py | 81 +- .../quantization/quant_utils.py | 47 + .../quantization/quantize.py | 47 +- onnx_neural_compressor/quantization/tuning.py 
| 155 +- onnx_neural_compressor/utility.py | 377 +-- onnx_neural_compressor/version.py | 2 - requirements.txt | 2 + .../layer_wise/test_layer_wise.py | 8 +- .../post_training_quant/test_calibrate.py | 588 +++++ .../post_training_quant/test_operators.py | 1957 ++++++++++++++ .../test_post_training_quant.py | 203 ++ .../post_training_quant/test_quant_utils.py | 62 + test/quantization/test_autotune.py | 228 +- test/quantization/test_config.py | 267 +- test/quantization/test_smooth_quant.py | 45 +- test/quantization/weight_only/test_awq.py | 4 +- test/quantization/weight_only/test_gptq.py | 4 +- test/quantization/weight_only/test_rtn.py | 4 +- test/utils/test_general.py | 111 +- test/utils/test_param.py | 5 +- test/utils/test_utility.py | 20 - 86 files changed, 14081 insertions(+), 2527 deletions(-) create mode 100644 examples/image_recognition/resnet50/quantization/ptq_static/README.md create mode 100644 examples/image_recognition/resnet50/quantization/ptq_static/main.py create mode 100644 examples/image_recognition/resnet50/quantization/ptq_static/prepare_model.py create mode 100644 examples/image_recognition/resnet50/quantization/ptq_static/requirements.txt create mode 100644 examples/image_recognition/resnet50/quantization/ptq_static/run_benchmark.sh create mode 100644 examples/image_recognition/resnet50/quantization/ptq_static/run_quant.sh create mode 100644 examples/nlp/bert/quantization/ptq_dynamic/README.md create mode 100644 examples/nlp/bert/quantization/ptq_dynamic/main.py create mode 100644 examples/nlp/bert/quantization/ptq_dynamic/prepare_data.sh create mode 100644 examples/nlp/bert/quantization/ptq_dynamic/prepare_model.py create mode 100644 examples/nlp/bert/quantization/ptq_dynamic/requirements.txt create mode 100644 examples/nlp/bert/quantization/ptq_dynamic/run_benchmark.sh create mode 100644 examples/nlp/bert/quantization/ptq_dynamic/run_quant.sh create mode 100644 examples/nlp/bert/quantization/ptq_static/README.md create mode 100644 examples/nlp/bert/quantization/ptq_static/main.py create mode 100644 examples/nlp/bert/quantization/ptq_static/prepare_data.sh create mode 100644 examples/nlp/bert/quantization/ptq_static/prepare_model.py create mode 100644 examples/nlp/bert/quantization/ptq_static/requirements.txt create mode 100644 examples/nlp/bert/quantization/ptq_static/run_benchmark.sh create mode 100644 examples/nlp/bert/quantization/ptq_static/run_quant.sh create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/__init__.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/calibrate.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/calibrator.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/__init__.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/activation.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/argmax.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/attention.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/binary_op.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/concat.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/conv.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/direct_q8.py create mode 100644 
onnx_neural_compressor/algorithms/post_training_quant/operators/embed_layernorm.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/gavgpool.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/gemm.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/lstm.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/matmul.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/maxpool.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/pooling.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/reduce.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/resize.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/split.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/operators/unary_op.py create mode 100644 onnx_neural_compressor/algorithms/post_training_quant/quantizer.py create mode 100644 onnx_neural_compressor/algorithms/utility.py delete mode 100644 onnx_neural_compressor/algorithms/weight_only/utility.py delete mode 100644 onnx_neural_compressor/config.py delete mode 100644 onnx_neural_compressor/quantization/calibrate.py create mode 100644 onnx_neural_compressor/quantization/config.py create mode 100644 onnx_neural_compressor/quantization/quant_utils.py create mode 100644 test/quantization/post_training_quant/test_calibrate.py create mode 100644 test/quantization/post_training_quant/test_operators.py create mode 100644 test/quantization/post_training_quant/test_post_training_quant.py create mode 100644 test/quantization/post_training_quant/test_quant_utils.py diff --git a/examples/.config/model_params_onnxrt.json b/examples/.config/model_params_onnxrt.json index 5db34a114..085c7ef6c 100644 --- a/examples/.config/model_params_onnxrt.json +++ b/examples/.config/model_params_onnxrt.json @@ -55,6 +55,34 @@ "input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past", "main_script": "main.py", "batch_size": 1 - } + }, + "bert_base_MRPC": { + "model_src_dir": "nlp/bert/quantization/ptq_static", + "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC", + "input_model": "/tf_dataset2/models/onnx/bert_base_MRPC/bert.onnx", + "main_script": "main.py", + "batch_size": 8 + }, + "bert_base_MRPC_dynamic": { + "model_src_dir": "nlp/bert/quantization/ptq_dynamic", + "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC", + "input_model": "/tf_dataset2/models/onnx/bert_base_MRPC/bert.onnx", + "main_script": "main.py", + "batch_size": 8 + }, + "resnet50-v1-12_qdq": { + "model_src_dir": "image_recognition/resnet50/quantization/ptq_static", + "dataset_location": "/tf_dataset2/datasets/imagenet/ImagenetRaw/ILSVRC2012_img_val", + "input_model": "/tf_dataset2/models/onnx/resnet50-v1-12/resnet50-v1-13.onnx", + "main_script": "main.py", + "batch_size": 1 + }, + "resnet50-v1-12": { + "model_src_dir": "image_recognition/resnet50/quantization/ptq_static", + "dataset_location": "/tf_dataset2/datasets/imagenet/ImagenetRaw/ILSVRC2012_img_val", + "input_model": "/tf_dataset2/models/onnx/resnet50-v1-12/resnet50-v1-12.onnx", + "main_script": "main.py", + "batch_size": 1 + }, } } diff --git 
a/examples/image_recognition/resnet50/quantization/ptq_static/README.md b/examples/image_recognition/resnet50/quantization/ptq_static/README.md
new file mode 100644
index 000000000..b8145eff8
--- /dev/null
+++ b/examples/image_recognition/resnet50/quantization/ptq_static/README.md
@@ -0,0 +1,63 @@
+# Step-by-Step
+
+This example loads an image classification model from the [ONNX Model Zoo](https://github.com/onnx/models) and confirms its accuracy and speed on the [ILSVRC2012 validation ImageNet dataset](http://www.image-net.org/challenges/LSVRC/2012/downloads). You need to download this dataset yourself.
+
+# Prerequisite
+
+## 1. Environment
+
+```shell
+pip install onnx-neural-compressor
+pip install -r requirements.txt
+```
+
+
+## 2. Prepare Model
+
+```shell
+python prepare_model.py --output_model='resnet50-v1-12.onnx'
+```
+
+## 3. Prepare Dataset
+
+Download the [ILSVRC2012 validation ImageNet dataset](http://www.image-net.org/challenges/LSVRC/2012/downloads).
+
+Download the label file:
+
+```shell
+wget http://dl.caffe.berkeleyvision.org/caffe_ilsvrc12.tar.gz
+tar -xvzf caffe_ilsvrc12.tar.gz val.txt
+```
+
+# Run
+
+
+## 1. Quantization
+
+Quantize the model with QLinearOps:
+
+```bash
+bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx
+                  --dataset_location=/path/to/imagenet \
+                  --label_path=/path/to/val.txt \
+                  --output_model=path/to/save
+```
+
+Quantize the model in QDQ mode:
+
+```bash
+bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx
+                  --dataset_location=/path/to/imagenet \
+                  --label_path=/path/to/val.txt \
+                  --output_model=path/to/save \
+                  --quant_format=QDQ
+```
+
+## 2. Benchmark
+
+```bash
+bash run_benchmark.sh --input_model=path/to/model \ # model path as *.onnx
+                      --dataset_location=/path/to/imagenet \
+                      --label_path=/path/to/val.txt \
+                      --mode=performance # or accuracy
+```
diff --git a/examples/image_recognition/resnet50/quantization/ptq_static/main.py b/examples/image_recognition/resnet50/quantization/ptq_static/main.py
new file mode 100644
index 000000000..8b6506e1e
--- /dev/null
+++ b/examples/image_recognition/resnet50/quantization/ptq_static/main.py
@@ -0,0 +1,277 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
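The README above drives quantization through `run_quant.sh`, and the `main.py` that follows implements the underlying API calls. As a reading aid, here is a condensed sketch of that flow using only the APIs exercised in this example (`data_reader.CalibrationDataReader`, `config.StaticQuantConfig.get_config_set_for_tuning`, `quantization.QuantFormat`, `tuning.autotune`). The random calibration reader, the constant-return `eval_fn`, and the file names are illustrative placeholders rather than files provided by the example.

```python
# A condensed sketch of the static-quantization flow that main.py implements.
# The random calibration reader, the constant-return eval_fn, and the file
# names are placeholders; the API calls mirror the ones used in this example.
import numpy as np
import onnx

from onnx_neural_compressor import data_reader, quantization
from onnx_neural_compressor.quantization import config, tuning


class RandomDataReader(data_reader.CalibrationDataReader):
    """Feeds a few random NCHW batches for calibration (stand-in for real ImageNet data)."""

    def __init__(self, model_path, num_batches=10):
        model = onnx.load(model_path, load_external_data=False)
        self.input_name = model.graph.input[0].name
        self.batches = [np.random.rand(1, 3, 224, 224).astype(np.float32) for _ in range(num_batches)]
        self.iter_next = iter(self.batches)

    def get_next(self):
        batch = next(self.iter_next, None)
        return {self.input_name: batch} if batch is not None else None

    def rewind(self):
        self.iter_next = iter(self.batches)


def eval_fn(model):
    # Placeholder: return a real accuracy here, e.g. the TopK metric defined in main.py.
    return 1.0


fp32_model = "resnet50-v1-12.onnx"  # produced by prepare_model.py
tune_config = tuning.TuningConfig(
    config_set=config.StaticQuantConfig.get_config_set_for_tuning(
        quant_format=quantization.QuantFormat.QDQ,  # or quantization.QuantFormat.QOperator
    )
)
best_model = tuning.autotune(
    model_input=fp32_model,
    tune_config=tune_config,
    eval_fn=eval_fn,
    calibration_data_reader=RandomDataReader(fp32_model),
)
onnx.save(best_model, "resnet50-v1-12-int8.onnx")
```

In the full script below, the calibration reader feeds preprocessed ImageNet batches and the evaluation function computes top-1 accuracy, which is what lets `autotune` compare candidate configurations against the FP32 baseline.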
+# pylint:disable=redefined-outer-name,logging-format-interpolation + +import argparse +import collections +import logging +import os +import re +import time + +import cv2 +import numpy as np +import onnx +import onnxruntime as ort +from PIL import Image +from sklearn import metrics + +from onnx_neural_compressor import data_reader, quantization +from onnx_neural_compressor.quantization import config, tuning + +logger = logging.getLogger(__name__) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN +) + + +def _topk_shape_validate(preds, labels): + # preds shape can be Nxclass_num or class_num(N=1 by default) + # it's more suitable for 'Accuracy' with preds shape Nx1(or 1) output from argmax + if isinstance(preds, int): + preds = [preds] + preds = np.array(preds) + elif isinstance(preds, np.ndarray): + preds = np.array(preds) + elif isinstance(preds, list): + preds = np.array(preds) + preds = preds.reshape((-1, preds.shape[-1])) + + # consider labels just int value 1x1 + if isinstance(labels, int): + labels = [labels] + labels = np.array(labels) + elif isinstance(labels, tuple): + labels = np.array([labels]) + labels = labels.reshape((labels.shape[-1], -1)) + elif isinstance(labels, list): + if isinstance(labels[0], int): + labels = np.array(labels) + labels = labels.reshape((labels.shape[0], 1)) + elif isinstance(labels[0], tuple): + labels = np.array(labels) + labels = labels.reshape((labels.shape[-1], -1)) + else: + labels = np.array(labels) + # labels most have 2 axis, 2 cases: N(or Nx1 sparse) or Nxclass_num(one-hot) + # only support 2 dimension one-shot labels + # or 1 dimension one-hot class_num will confuse with N + + if len(preds.shape) == 1: + N = 1 + class_num = preds.shape[0] + preds = preds.reshape([-1, class_num]) + elif len(preds.shape) >= 2: + N = preds.shape[0] + preds = preds.reshape([N, -1]) + class_num = preds.shape[1] + + label_N = labels.shape[0] + assert label_N == N, "labels batch size should same with preds" + labels = labels.reshape([N, -1]) + # one-hot labels will have 2 dimension not equal 1 + if labels.shape[1] != 1: + labels = labels.argsort()[..., -1:] + return preds, labels + + +class TopK: + def __init__(self, k=1): + self.k = k + self.num_correct = 0 + self.num_sample = 0 + + def update(self, preds, labels, sample_weight=None): + preds, labels = _topk_shape_validate(preds, labels) + preds = preds.argsort()[..., -self.k :] + if self.k == 1: + correct = metrics.accuracy_score(preds, labels, normalize=False) + self.num_correct += correct + + else: + for p, l in zip(preds, labels): + # get top-k labels with np.argpartition + # p = np.argpartition(p, -self.k)[-self.k:] + l = l.astype("int32") + if l in p: + self.num_correct += 1 + + self.num_sample += len(labels) + + def reset(self): + self.num_correct = 0 + self.num_sample = 0 + + def result(self): + if self.num_sample == 0: + logger.warning("Sample num during evaluation is 0.") + return 0 + return self.num_correct / self.num_sample + + +class DataReader(data_reader.CalibrationDataReader): + def __init__(self, model_path, dataset_location, image_list, batch_size=1, calibration_sampling_size=-1): + self.batch_size = batch_size + self.image_list = [] + self.label_list = [] + src_lst = [] + label_lst = [] + num = 0 + with open(image_list, "r") as f: + for s in f: + image_name, label = re.split(r"\s+", s.strip()) + src = os.path.join(dataset_location, image_name) + if not os.path.exists(src): + continue + src_lst.append(src) + 
label_lst.append(int(label)) + if len(src_lst) == batch_size: + self.image_list.append(src_lst) + self.label_list.append(label_lst) + num += batch_size + if calibration_sampling_size > 0 and num >= calibration_sampling_size: + break + src_lst = [] + label_lst = [] + if len(src_lst) > 0: + self.image_list.append(src_lst) + self.label_list.append(label_lst) + model = onnx.load(model_path, load_external_data=False) + self.inputs_names = [input.name for input in model.graph.input] + self.iter_next = iter(self.image_list) + + def _preprpcess(self, src): + with Image.open(src) as image: + image = np.array(image.convert("RGB")).astype(np.float32) + image = image / 255.0 + image = cv2.resize(image, (256, 256), interpolation=cv2.INTER_LINEAR) + + h, w = image.shape[0], image.shape[1] + + y0 = (h - 224) // 2 + x0 = (w - 224) // 2 + image = image[y0 : y0 + 224, x0 : x0 + 224, :] + image = (image - [0.485, 0.456, 0.406]) / [0.229, 0.224, 0.225] + image = image.transpose((2, 0, 1)) + return image.astype("float32") + + def get_next(self): + lst = next(self.iter_next, None) + if lst is not None: + return {self.inputs_names[0]: np.stack([self._preprpcess(src) for src in lst])} + else: + return None + + def rewind(self): + self.iter_next = iter(self.image_list) + + +def eval_func(model, dataloader, metric): + metric.reset() + sess = ort.InferenceSession(model, providers=ort.get_available_providers()) + labels = dataloader.label_list + for idx, batch in enumerate(dataloader): + output = sess.run(None, batch) + metric.update(output, labels[idx]) + return metric.result() + + +if __name__ == "__main__": + logger.info("Evaluating ONNXRuntime full precision accuracy and performance:") + parser = argparse.ArgumentParser( + description="Resnet50 fine-tune examples for image classification tasks.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("--model_path", type=str, help="Pre-trained model on onnx file") + parser.add_argument("--dataset_location", type=str, help="Imagenet data path") + parser.add_argument("--label_path", type=str, help="Imagenet label path") + parser.add_argument("--benchmark", action="store_true", default=False) + parser.add_argument("--tune", action="store_true", default=False, help="whether quantize the model") + parser.add_argument("--output_model", type=str, help="output model path") + parser.add_argument("--mode", type=str, help="benchmark mode of performance or accuracy") + parser.add_argument( + "--intra_op_num_threads", type=int, default=4, help="intra_op_num_threads for performance benchmark" + ) + parser.add_argument( + "--quant_format", type=str, default="QOperator", choices=["QDQ", "QOperator"], help="quantization format" + ) + parser.add_argument( + "--batch_size", + default=1, + type=int, + ) + args = parser.parse_args() + + top1 = TopK() + dataloader = DataReader(args.model_path, args.dataset_location, args.label_path, args.batch_size) + + def eval(onnx_model): + dataloader.rewind() + return eval_func(onnx_model, dataloader, top1) + + if args.benchmark: + if args.mode == "performance": + total_time = 0.0 + num_iter = 100 + num_warmup = 10 + + sess_options = ort.SessionOptions() + sess_options.intra_op_num_threads = args.intra_op_num_threads + session = ort.InferenceSession(args.model_path, sess_options, providers=ort.get_available_providers()) + ort_inputs = {} + len_inputs = len(session.get_inputs()) + inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] + + for idx, batch in enumerate(dataloader): + if idx + 1 > 
num_iter: + break + tic = time.time() + predictions = session.run(None, batch) + toc = time.time() + if idx >= num_warmup: + total_time += toc - tic + + print("\n", "-" * 10, "Summary:", "-" * 10) + print(args) + throughput = (num_iter - num_warmup) / total_time + print("Throughput: {} samples/s".format(throughput)) + elif args.mode == "accuracy": + acc_result = eval_func(args.model_path, dataloader, top1) + print("Batch size = %d" % dataloader.batch_size) + print("Accuracy: %.5f" % acc_result) + + if args.tune: + calibration_data_reader = DataReader( + args.model_path, args.dataset_location, args.label_path, args.batch_size, calibration_sampling_size=100 + ) + + custom_tune_config = tuning.TuningConfig( + config_set=config.StaticQuantConfig.get_config_set_for_tuning( + quant_format=( + quantization.QuantFormat.QOperator + if args.quant_format == "QOperator" + else quantization.QuantFormat.QDQ + ), + ) + ) + best_model = tuning.autotune( + model_input=args.model_path, + tune_config=custom_tune_config, + eval_fn=eval, + calibration_data_reader=calibration_data_reader, + ) + onnx.save(best_model, args.output_model) diff --git a/examples/image_recognition/resnet50/quantization/ptq_static/prepare_model.py b/examples/image_recognition/resnet50/quantization/ptq_static/prepare_model.py new file mode 100644 index 000000000..8d7d8d4a9 --- /dev/null +++ b/examples/image_recognition/resnet50/quantization/ptq_static/prepare_model.py @@ -0,0 +1,57 @@ +import argparse +import os +import sys +import urllib + +MODEL_URL = "https://github.com/onnx/models/raw/main/validated/vision/classification/resnet/model/resnet50-v1-12.onnx" +MAX_TIMES_RETRY_DOWNLOAD = 5 + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("--input_model", type=str, required=False, default="resnet50-v1-12.onnx") + parser.add_argument("--output_model", type=str, required=True) + return parser.parse_args() + + +def progressbar(cur, total=100): + percent = "{:.2%}".format(cur / total) + sys.stdout.write("\r[%-100s] %s" % ("#" * int(cur), percent)) + sys.stdout.flush() + + +def schedule(blocknum, blocksize, totalsize): + if totalsize == 0: + percent = 0 + else: + percent = min(1.0, blocknum * blocksize / totalsize) * 100 + progressbar(percent) + + +def download_model(url, model_name, retry_times=5): + if os.path.isfile(model_name): + print(f"{model_name} exists, skip download") + return True + + print("download model...") + retries = 0 + while retries < retry_times: + try: + urllib.request.urlretrieve(url, model_name, schedule) + break + except KeyboardInterrupt: + return False + except: + retries += 1 + print(f"Download failed{', Retry downloading...' 
if retries < retry_times else '!'}") + return retries < retry_times + + +def prepare_model(input_model, output_model): + # Download model from [ONNX Model Zoo](https://github.com/onnx/models) + download_model(MODEL_URL, output_model, MAX_TIMES_RETRY_DOWNLOAD) + + +if __name__ == "__main__": + args = parse_arguments() + prepare_model(args.input_model, args.output_model) diff --git a/examples/image_recognition/resnet50/quantization/ptq_static/requirements.txt b/examples/image_recognition/resnet50/quantization/ptq_static/requirements.txt new file mode 100644 index 000000000..1fc10dd8a --- /dev/null +++ b/examples/image_recognition/resnet50/quantization/ptq_static/requirements.txt @@ -0,0 +1,6 @@ +onnx +onnxruntime +torch +torchvision +onnxruntime-extensions +pillow>=8.2.0 # not directly required, pinned by Snyk to avoid a vulnerability diff --git a/examples/image_recognition/resnet50/quantization/ptq_static/run_benchmark.sh b/examples/image_recognition/resnet50/quantization/ptq_static/run_benchmark.sh new file mode 100644 index 000000000..65c3505be --- /dev/null +++ b/examples/image_recognition/resnet50/quantization/ptq_static/run_benchmark.sh @@ -0,0 +1,49 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo "$var" |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo "$var" |cut -f2 -d=) + ;; + --label_path=*) + label_path=$(echo "$var" |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo "$var" |cut -f2 -d=) + ;; + --intra_op_num_threads=*) + intra_op_num_threads=$(echo "$var" |cut -f2 -d=) + ;; + esac + done + +} + +# run_benchmark +function run_benchmark { + + python main.py \ + --model_path "${input_model}" \ + --dataset_location "${dataset_location}" \ + --label_path "${label_path-${dataset_location}/../val.txt}" \ + --mode "${mode}" \ + --batch_size 1 \ + --intra_op_num_threads "${intra_op_num_threads-4}" \ + --benchmark + +} + +main "$@" diff --git a/examples/image_recognition/resnet50/quantization/ptq_static/run_quant.sh b/examples/image_recognition/resnet50/quantization/ptq_static/run_quant.sh new file mode 100644 index 000000000..0e44d8d02 --- /dev/null +++ b/examples/image_recognition/resnet50/quantization/ptq_static/run_quant.sh @@ -0,0 +1,47 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + run_tuning + +} + +# init params +function init_params { + + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo "$var" |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo "$var" |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo "$var" |cut -f2 -d=) + ;; + --label_path=*) + label_path=$(echo "$var" |cut -f2 -d=) + ;; + --quant_format=*) + quant_format=$(echo "$var" |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_tuning { + python main.py \ + --model_path "${input_model}" \ + --dataset_location "${dataset_location}" \ + --label_path "${label_path-${dataset_location}/../val.txt}" \ + --output_model "${output_model}" \ + --quant_format "${quant_format-QOperator}" \ + --tune +} + +main "$@" diff --git a/examples/nlp/bert/quantization/ptq_dynamic/README.md b/examples/nlp/bert/quantization/ptq_dynamic/README.md new file mode 100644 index 000000000..212c8b899 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_dynamic/README.md @@ -0,0 +1,51 @@ +# Step-by-Step + +This example load a BERT model and confirm its accuracy and speed based on [GLUE 
data](https://gluebenchmark.com/). + +# Prerequisite + +## 1. Environment + +```shell +pip install onnx-neural-compressor +pip install -r requirements.txt +``` + + +## 2. Prepare Dataset + +download the GLUE data with `prepare_data.sh` script. + +```shell +export GLUE_DIR=path/to/glue_data +export TASK_NAME=MRPC + +bash prepare_data.sh --data_dir=$GLUE_DIR --task_name=$TASK_NAME +``` + +## 3. Prepare Model + +```shell +python prepare_model.py --input_model='MRPC.zip' --output_model='bert.onnx' +``` + +# Run + +## 1. Quantization + +Dynamic quantization: + +```bash +bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx + --output_model=path/to/model_tune \ # model path as *.onnx + --dataset_location=path/to/glue_data +``` + +## 2. Benchmark + +```bash +bash run_benchmark.sh --input_model=path/to/model \ # model path as *.onnx + --dataset_location=path/to/glue_data \ + --batch_size=batch_size \ + --mode=performance # or accuracy +``` diff --git a/examples/nlp/bert/quantization/ptq_dynamic/main.py b/examples/nlp/bert/quantization/ptq_dynamic/main.py new file mode 100644 index 000000000..38b0b6757 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_dynamic/main.py @@ -0,0 +1,442 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint:disable=redefined-outer-name,logging-format-interpolation + +import argparse +import dataclasses +import logging +import os +import pathlib +import tempfile +import time +from typing import List, Optional, Union + +import numpy as np +import onnx +import onnxruntime +import torch +import transformers +from onnxruntime.transformers import optimizer +from onnxruntime.transformers.fusion_options import FusionOptions +from torch.utils import data + +from onnx_neural_compressor.quantization import config, tuning + +logger = logging.getLogger(__name__) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN +) + + +class ONNXRTBertDataset: + """Dataset used for model Bert. + Args: data_dir (str): The input data dir. + model_name_or_path (str): Path to pre-trained student model or shortcut name, + selected in the list: + max_seq_length (int, default=128): The maximum length after tokenization. + Sequences longer than this will be truncated, + sequences shorter will be padded. + do_lower_case (bool, default=True): Whether to lowercase the input when tokenizing. + task (str, default=mrpc): The name of the task to fine-tune. + Choices include mrpc, qqp, qnli, rte, + sts-b, cola, mnli, wnli. + model_type (str, default='bert'): model type, support 'distilbert', 'bert', + 'mobilebert', 'roberta'. + dynamic_length (bool, default=False): Whether to use fixed sequence length. + evaluate (bool, default=True): Whether do evaluation or training. 
+ transform (transform object, default=None): transform to process input data. + filter (Filter objects, default=None): filter out examples according + to specific conditions. + """ + + def __init__( + self, + model, + data_dir, + model_name_or_path, + max_seq_length=128, + do_lower_case=True, + task="mrpc", + model_type="bert", + dynamic_length=False, + evaluate=True, + transform=None, + filter=None, + ): + self.inputs = [inp.name for inp in onnx.load(model).graph.input] + task = task.lower() + model_type = model_type.lower() + assert task in ["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", "mnli", "wnli", "sst-2"], "Unsupported task type" + assert model_type in [ + "distilbert", + "bert", + "mobilebert", + "roberta", + ], "Unsupported \ + model type" + self.dynamic_length = dynamic_length + self.model_type = model_type + self.max_seq_length = max_seq_length + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) + self.dataset = load_and_cache_examples( + data_dir, model_name_or_path, max_seq_length, task, model_type, tokenizer, evaluate + ) + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, index): + batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in self.dataset[index]) + return batch[: len(self.inputs)], batch[-1] + + +def load_and_cache_examples(data_dir, model_name_or_path, max_seq_length, task, model_type, tokenizer, evaluate): + processor = transformers.glue_processors[task]() + output_mode = transformers.glue_output_modes[task] + # Load data features from cache or dataset file + if not os.path.exists("./dataset_cached"): + os.makedirs("./dataset_cached") + cached_features_file = os.path.join( + "./dataset_cached", + "cached_{}_{}_{}_{}".format( + "dev" if evaluate else "train", + list(filter(None, model_name_or_path.split("/"))).pop(), + str(max_seq_length), + str(task), + ), + ) + if os.path.exists(cached_features_file): + logger.info("Load features from cached file {}.".format(cached_features_file)) + features = torch.load(cached_features_file) + else: + logger.info("Create features from dataset file at {}.".format(data_dir)) + label_list = processor.get_labels() + examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) + features = convert_examples_to_features( + examples, + tokenizer, + task=task, + label_list=label_list, + max_length=max_seq_length, + output_mode=output_mode, + ) + logger.info("Save features into cached file {}.".format(cached_features_file)) + torch.save(features, cached_features_file) + # Convert to Tensors and build dataset + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) + all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long) + if output_mode == "classification": + all_labels = torch.tensor([f.label for f in features], dtype=torch.long) + elif output_mode == "regression": + all_labels = torch.tensor([f.label for f in features], dtype=torch.float) + dataset = data.TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_seq_lengths, all_labels) + return dataset + + +def convert_examples_to_features( + examples, + tokenizer, + max_length=128, + task=None, + label_list=None, + output_mode="classification", + pad_token=0, + 
pad_token_segment_id=0, + mask_padding_with_zero=True, +): + processor = transformers.glue_processors[task]() + if label_list is None: + label_list = processor.get_labels() + logger.info("Use label list {} for task {}.".format(label_list, task)) + label_map = {label: i for i, label in enumerate(label_list)} + features = [] + for ex_index, example in enumerate(examples): + inputs = tokenizer.encode_plus( + example.text_a, + example.text_b, + add_special_tokens=True, + max_length=max_length, + return_token_type_ids=True, + truncation=True, + ) + input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. + seq_length = len(input_ids) + padding_length = max_length - len(input_ids) + + input_ids = input_ids + ([pad_token] * padding_length) + attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) + + assert len(input_ids) == max_length, "Error with input_ids length {} vs {}".format(len(input_ids), max_length) + assert len(attention_mask) == max_length, "Error with attention_mask length {} vs {}".format( + len(attention_mask), max_length + ) + assert len(token_type_ids) == max_length, "Error with token_type_ids length {} vs {}".format( + len(token_type_ids), max_length + ) + if output_mode == "classification": + label = label_map[example.label] + elif output_mode == "regression": + label = float(example.label) + else: + raise KeyError(output_mode) + + feats = InputFeatures( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + label=label, + seq_length=seq_length, + ) + features.append(feats) + return features + + +@dataclasses.dataclass(frozen=True) +class InputFeatures: + """ + A single set of features of data. + Property names are the same names as the corresponding inputs to a model. + Args: + input_ids: Indices of input sequence tokens in the vocabulary. + attention_mask: Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED, + ``0`` for MASKED (padded) tokens. + token_type_ids: (Optional) Segment token indices to indicate first and second + portions of the inputs. Only some models use them. + label: (Optional) Label corresponding to the input. Int for classification problems, + float for regression problems. + seq_length: (Optional) The length of input sequence before padding. + """ + + input_ids: List[int] + attention_mask: Optional[List[int]] = None + token_type_ids: Optional[List[int]] = None + label: Optional[Union[int, float]] = None + seq_length: Optional[List[int]] = None + + +class ONNXRTGLUE: + """Computes GLUE score. + + Args: + task (str, default=mrpc): The name of the task. + Choices include mrpc, qqp, qnli, rte, + sts-b, cola, mnli, wnli. 
+ + """ + + def __init__(self, task="mrpc"): + assert task in ["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", "mnli", "wnli", "sst-2"], "Unsupported task type" + self.pred_list = None + self.label_list = None + self.task = task + self.return_key = { + "cola": "mcc", + "mrpc": "acc", + "sts-b": "corr", + "qqp": "acc", + "mnli": "mnli/acc", + "qnli": "acc", + "rte": "acc", + "wnli": "acc", + "sst-2": "acc", + } + + def update(self, preds, labels): + """add preds and labels to storage""" + if isinstance(preds, list) and len(preds) == 1: + preds = preds[0] + if isinstance(labels, list) and len(labels) == 1: + labels = labels[0] + if self.pred_list is None: + self.pred_list = preds + self.label_list = labels + else: + self.pred_list = np.append(self.pred_list, preds, axis=0) + self.label_list = np.append(self.label_list, labels, axis=0) + + def reset(self): + """clear preds and labels storage""" + self.pred_list = None + self.label_list = None + + def result(self): + """calculate metric""" + output_mode = transformers.glue_output_modes[self.task] + + if output_mode == "classification": + processed_preds = np.argmax(self.pred_list, axis=1) + elif output_mode == "regression": + processed_preds = np.squeeze(self.pred_list) + result = transformers.glue_compute_metrics(self.task, processed_preds, self.label_list) + return result[self.return_key[self.task]] + + +if __name__ == "__main__": + logger.info("Evaluating ONNXRuntime full precision accuracy and performance:") + parser = argparse.ArgumentParser( + description="BERT fine-tune examples for classification/regression tasks.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("--model_path", type=str, help="Pre-trained resnet50 model on onnx file") + parser.add_argument("--benchmark", action="store_true", default=False) + parser.add_argument("--tune", action="store_true", default=False, help="whether quantize the model") + parser.add_argument("--output_model", type=str, help="output model path") + parser.add_argument("--mode", type=str, help="benchmark mode of performance or accuracy") + parser.add_argument("--model_name_or_path", type=str, help="pretrained model name or path") + parser.add_argument("--data_path", type=str, help="input data path") + parser.add_argument( + "--batch_size", + default=8, + type=int, + ) + parser.add_argument( + "--task", + type=str, + default="mrpc", + choices=["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", "mnli", "wnli", "sst-2"], + help="GLUE task name", + ) + parser.add_argument("--dynamic_length", type=bool, default=False, help="dynamic length") + parser.add_argument("--max_seq_length", type=int, default=128, help="max sequence length") + parser.add_argument( + "--model_type", + type=str, + default="bert", + choices=["distilbert", "bert", "mobilebert", "roberta"], + help="model type", + ) + parser.add_argument("--intra_op_num_threads", type=int, default=4) + args = parser.parse_args() + dataset = ONNXRTBertDataset( + args.model_path, + data_dir=args.data_path, + model_name_or_path=args.model_name_or_path, + max_seq_length=args.max_seq_length, + task=args.task, + model_type=args.model_type, + dynamic_length=args.dynamic_length, + ) + dataloader = data.DataLoader( + dataset, + sampler=data.SequentialSampler(dataset), + batch_size=args.batch_size, + shuffle=False, + ) + + def eval_func(model): + metric = ONNXRTGLUE(args.task) + session = onnxruntime.InferenceSession(model, providers=onnxruntime.get_available_providers()) + ort_inputs = {} + len_inputs = len(session.get_inputs()) + 
inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] + + for idx, batch in enumerate(dataloader): + label = batch[-1] + batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in batch[0]) + batch_seq_length = args.max_seq_length if not args.dynamic_length else batch[0].shape[-1] + data = [ + batch[0][:, :batch_seq_length], + batch[1][:, :batch_seq_length], + batch[2][:, :batch_seq_length], + ] + for i in range(len_inputs): + ort_inputs.update({inputs_names[i]: data[i]}) + predictions = session.run(None, ort_inputs) + metric.update(predictions[0], label) + return metric.result() + + if args.benchmark: + if args.mode == "performance": + total_time = 0.0 + num_iter = 100 + num_warmup = 10 + + sess_options = onnxruntime.SessionOptions() + sess_options.intra_op_num_threads = args.intra_op_num_threads + session = onnxruntime.InferenceSession( + args.model_path, sess_options, providers=onnxruntime.get_available_providers() + ) + ort_inputs = {} + len_inputs = len(session.get_inputs()) + inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] + + for idx, batch in enumerate(dataloader): + if idx + 1 > num_iter: + break + + batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in batch[0]) + batch_seq_length = args.max_seq_length if not args.dynamic_length else batch[0].shape[-1] + data = [ + batch[0][:, :batch_seq_length], + batch[1][:, :batch_seq_length], + batch[2][:, :batch_seq_length], + ] + for i in range(len_inputs): + ort_inputs.update({inputs_names[i]: data[i]}) + tic = time.time() + predictions = session.run(None, ort_inputs) + toc = time.time() + if idx >= num_warmup: + total_time += toc - tic + + print("\n", "-" * 10, "Summary:", "-" * 10) + print(args) + throughput = (num_iter - num_warmup) / total_time + print("Throughput: {} samples/s".format(throughput)) + elif args.mode == "accuracy": + acc_result = eval_func(args.model_path) + print("Batch size = %d" % args.batch_size) + print("Accuracy: %.5f" % acc_result) + + if args.tune: + # optimize model + with tempfile.TemporaryDirectory(prefix="ort.opt.") as tmp_dir: + opt_options = FusionOptions("bert") + opt_options.enable_embed_layer_norm = False + + model_optimizer = optimizer.optimize_model( + args.model_path, "bert", num_heads=12, hidden_size=768, optimization_options=opt_options + ) + model = model_optimizer.model + + # check the optimized model is valid + try: + onnxruntime.InferenceSession(model.SerializeToString(), providers=onnxruntime.get_available_providers()) + onnx.save(model, pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix()) + model = pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix() + except Exception as e: + logger.warning("Optimized model is invalid: {}. ".format(e)) + logger.warning("Model optimizer will be skipped. 
" "Try to upgrade onnxruntime to avoid this error") + model = args.model_path + + custom_tune_config = tuning.TuningConfig(config_set=config.DynamicQuantConfig.get_config_set_for_tuning()) + best_model = tuning.autotune( + model_input=model, + tune_config=custom_tune_config, + eval_fn=eval_func, + optimization_level=onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL, + ) + onnx.save(best_model, args.output_model) diff --git a/examples/nlp/bert/quantization/ptq_dynamic/prepare_data.sh b/examples/nlp/bert/quantization/ptq_dynamic/prepare_data.sh new file mode 100644 index 000000000..c1fddb546 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_dynamic/prepare_data.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + download_data + +} + +# init params +function init_params { + + for var in "$@" + do + case $var in + --data_dir=*) + data_dir=$(echo "$var" |cut -f2 -d=) + ;; + --task_name=*) + task_name=$(echo "$var" |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function download_data { + wget https://raw.githubusercontent.com/huggingface/transformers/f98ef14d161d7bcdc9808b5ec399981481411cc1/utils/download_glue_data.py + python download_glue_data.py --data_dir="${data_dir}" --tasks="${task_name}" +} + +main "$@" + diff --git a/examples/nlp/bert/quantization/ptq_dynamic/prepare_model.py b/examples/nlp/bert/quantization/ptq_dynamic/prepare_model.py new file mode 100644 index 000000000..5b9216640 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_dynamic/prepare_model.py @@ -0,0 +1,118 @@ +import argparse +import os +import sys +import urllib +import zipfile + +import torch +import transformers + +# Please refer to [Bert-GLUE_OnnxRuntime_quantization guide] +# (https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/notebooks/bert/Bert-GLUE_OnnxRuntime_quantization.ipynb) +# for detailed model export. + +MODEL_URL = "https://download.pytorch.org/tutorial/MRPC.zip" +MAX_TIMES_RETRY_DOWNLOAD = 5 + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("--input_model", type=str, required=False, default="MRPC.zip") + parser.add_argument("--output_model", type=str, required=True) + parser.add_argument("--max_len", type=int, default=128, help="Maximum length of the sentence pairs") + return parser.parse_args() + + +def progressbar(cur, total=100): + percent = "{:.2%}".format(cur / total) + sys.stdout.write("\r[%-100s] %s" % ("#" * int(cur), percent)) + sys.stdout.flush() + + +def schedule(blocknum, blocksize, totalsize): + if totalsize == 0: + percent = 0 + else: + percent = min(1.0, blocknum * blocksize / totalsize) * 100 + progressbar(percent) + + +def is_zip_file(filename): + try: + with open(filename, "rb") as f: + magic_number = f.read(4) + return magic_number == b"PK\x03\x04" # ZIP file magic number + except OSError: + return False + + +def extrafile(filename, target_folder="."): + with zipfile.ZipFile(filename, "r") as zin: + zin.extractall(target_folder) + + +def download_model(url, model_name, retry_times=5): + if os.path.isdir(model_name): + return model_name + elif os.path.exists(model_name) and is_zip_file(model_name): + print("file downloaded") + extrafile(model_name) + return True + + print("download model...") + retries = 0 + while retries < retry_times: + try: + urllib.request.urlretrieve(url, model_name, schedule) + extrafile(model_name) + break + except KeyboardInterrupt: + return False + except: + retries += 1 + print(f"Download failed{', Retry downloading...' 
if retries < retry_times else '!'}") + return retries < retry_times + + +def export_model(model, output_model, max_len=128): + with torch.no_grad(): + inputs = { + "input_ids": torch.ones(1, max_len, dtype=torch.int64), + "attention_mask": torch.ones(1, max_len, dtype=torch.int64), + "token_type_ids": torch.ones(1, max_len, dtype=torch.int64), + } + + symbolic_names = {0: "batch_size", 1: "max_seq_len"} + torch.onnx.export( + model, # model being run + ( + inputs["input_ids"], + inputs["attention_mask"], + inputs["token_type_ids"], + ), # model input (or a tuple for multiple inputs) + output_model, # where to save the model (can be a file or file-like object) + opset_version=14, # the ONNX version to export the model + do_constant_folding=True, # whether to execute constant folding + input_names=["input_ids", "input_mask", "segment_ids"], # the model's input names + output_names=["output"], # the model's output names + dynamic_axes={ + "input_ids": symbolic_names, # variable length axes + "input_mask": symbolic_names, + "segment_ids": symbolic_names, + }, + ) + assert os.path.exists(output_model), f"Export failed! {output_model} doesn't exist!" + print("ONNX Model exported to {0}".format(output_model)) + + +def prepare_model(input_model, output_model, max_len): + is_download_successful = download_model(MODEL_URL, input_model, MAX_TIMES_RETRY_DOWNLOAD) + if is_download_successful: + folder_name = is_download_successful if isinstance(is_download_successful, str) else "./MRPC" + model = transformers.BertForSequenceClassification.from_pretrained(folder_name) + export_model(model, output_model, max_len) + + +if __name__ == "__main__": + args = parse_arguments() + prepare_model(args.input_model, args.output_model, args.max_len) diff --git a/examples/nlp/bert/quantization/ptq_dynamic/requirements.txt b/examples/nlp/bert/quantization/ptq_dynamic/requirements.txt new file mode 100644 index 000000000..85dc725a4 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_dynamic/requirements.txt @@ -0,0 +1,8 @@ +torch +transformers +accelerate +onnx +onnxruntime +coloredlogs +sympy +onnxruntime-extensions diff --git a/examples/nlp/bert/quantization/ptq_dynamic/run_benchmark.sh b/examples/nlp/bert/quantization/ptq_dynamic/run_benchmark.sh new file mode 100644 index 000000000..b92ae1ce1 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_dynamic/run_benchmark.sh @@ -0,0 +1,64 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo "$var" |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo "$var" |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo "$var" |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo "$var" |cut -f2 -d=) + ;; + --intra_op_num_threads=*) + intra_op_num_threads=$(echo "$var" |cut -f2 -d=) + ;; + esac + done + +} + +# run_benchmark +function run_benchmark { + if [[ ${mode} == "accuracy" ]]; then + dynamic_length=False + elif [[ ${mode} == "performance" ]]; then + dynamic_length=True + else + echo "Error: No such mode: ${mode}" + exit 1 + fi + + model_name_or_path="bert-base-uncased" + task_name="mrpc" + + python main.py \ + --model_path "${input_model}" \ + --model_name_or_path "${model_name_or_path}" \ + --data_path "${dataset_location}" \ + --task "${task_name}" \ + --batch_size "${batch_size}" \ + --mode "${mode}" \ + --dynamic_length "${dynamic_length}" \ + --intra_op_num_threads "${intra_op_num_threads-4}" \ + --benchmark 
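The BERT example above also funnels quantization through `tuning.autotune`. Stripped of the GLUE data loading and the onnxruntime.transformers optimization step, the dynamic-quantization call in `main.py` reduces to the sketch below; the model path and the constant-return `eval_fn` are illustrative placeholders, while the API calls are the ones used in the example itself.

```python
# A condensed sketch of the dynamic-quantization tuning call from the BERT main.py above.
# "bert.onnx" and the constant-return eval_fn are illustrative placeholders.
import onnx
import onnxruntime

from onnx_neural_compressor.quantization import config, tuning


def eval_fn(model):
    # Placeholder: in the example this is the GLUE metric computed by ONNXRTGLUE.
    return 1.0


tune_config = tuning.TuningConfig(config_set=config.DynamicQuantConfig.get_config_set_for_tuning())
best_model = tuning.autotune(
    model_input="bert.onnx",  # e.g. the model exported by prepare_model.py
    tune_config=tune_config,
    eval_fn=eval_fn,
    # main.py pre-optimizes the model with onnxruntime.transformers, then disables
    # further onnxruntime graph optimization during quantization.
    optimization_level=onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL,
)
onnx.save(best_model, "bert-int8-dynamic.onnx")
```

Unlike the static ResNet50 example, no calibration data reader is needed here, since dynamic quantization computes activation scales at runtime.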
+ +} + +main "$@" + diff --git a/examples/nlp/bert/quantization/ptq_dynamic/run_quant.sh b/examples/nlp/bert/quantization/ptq_dynamic/run_quant.sh new file mode 100644 index 000000000..53e864930 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_dynamic/run_quant.sh @@ -0,0 +1,47 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + run_tuning +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo "$var" |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo "$var" |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo "$var" |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_tuning { + model_name_or_path="bert-base-uncased" + batch_size=8 + task_name="mrpc" + + python main.py \ + --model_path "${input_model}" \ + --output_model "${output_model}" \ + --model_name_or_path "${model_name_or_path}" \ + --data_path "${dataset_location}" \ + --task "${task_name}" \ + --batch_size "${batch_size}" \ + --tune +} + +main "$@" + + + diff --git a/examples/nlp/bert/quantization/ptq_static/README.md b/examples/nlp/bert/quantization/ptq_static/README.md new file mode 100644 index 000000000..c34e76a79 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_static/README.md @@ -0,0 +1,60 @@ +Step-by-Step +============ + +This example load a BERT model and confirm its accuracy and speed based on [GLUE data](https://gluebenchmark.com/). + +# Prerequisite + +## 1. Environment + +```shell +pip install onnx-neural-compressor +pip install -r requirements.txt +``` + +## 2. Prepare Dataset + +download the GLUE data with `prepare_data.sh` script. +```shell +export GLUE_DIR=path/to/glue_data +export TASK_NAME=MRPC + +bash prepare_data.sh --data_dir=$GLUE_DIR --task_name=$TASK_NAME +``` + +## 3. Prepare Model + +```shell +python prepare_model.py --input_model='MRPC.zip' --output_model='bert.onnx' +``` + +# Run + +## 1. Quantization + +Static quantization with QOperator format: + +```bash +bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx + --output_model=path/to/model_tune \ + --dataset_location=path/to/glue_data \ + --quant_format="QOperator" +``` + +Static quantization with QDQ format: + +```bash +bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx + --output_model=path/to/model_tune \ # model path as *.onnx + --dataset_location=path/to/glue_data \ + --quant_format="QDQ" +``` + +## 2. Benchmark + +```bash +bash run_benchmark.sh --input_model=path/to/model \ # model path as *.onnx + --dataset_location=path/to/glue_data \ + --batch_size=batch_size \ + --mode=performance # or accuracy +``` diff --git a/examples/nlp/bert/quantization/ptq_static/main.py b/examples/nlp/bert/quantization/ptq_static/main.py new file mode 100644 index 000000000..bfaf55504 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_static/main.py @@ -0,0 +1,509 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint:disable=redefined-outer-name,logging-format-interpolation + +import argparse +import dataclasses +import logging +import os +import pathlib +import tempfile +import time +from typing import List, Optional, Union + +import numpy as np +import onnx +import onnxruntime +import torch +import transformers +from onnxruntime.transformers import optimizer +from onnxruntime.transformers.fusion_options import FusionOptions +from torch.utils import data + +from onnx_neural_compressor import data_reader, quantization +from onnx_neural_compressor.quantization import config, tuning + +logger = logging.getLogger(__name__) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN +) +logger.info("Evaluating ONNXRuntime full precision accuracy and performance:") +parser = argparse.ArgumentParser( + description="BERT fine-tune examples for classification/regression tasks.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, +) +parser.add_argument("--model_path", type=str, help="Pre-trained model on onnx file") +parser.add_argument("--benchmark", action="store_true", default=False) +parser.add_argument("--tune", action="store_true", default=False, help="whether quantize the model") +parser.add_argument("--output_model", type=str, help="output model path") +parser.add_argument("--mode", type=str, help="benchmark mode of performance or accuracy") +parser.add_argument("--model_name_or_path", type=str, help="pretrained model name or path") +parser.add_argument("--data_path", type=str, help="input data path") +parser.add_argument( + "--batch_size", + default=8, + type=int, +) +parser.add_argument( + "--task", + type=str, + default="mrpc", + choices=["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", "mnli", "wnli", "sst-2"], + help="GLUE task name", +) +parser.add_argument( + "--quant_format", type=str, default="QOperator", choices=["QDQ", "QOperator"], help="quantization format" +) +parser.add_argument( + "--intra_op_num_threads", type=int, default=4, help="intra_op_num_threads for performance benchmark" +) +parser.add_argument("--dynamic_length", type=bool, default=False, help="dynamic length") +parser.add_argument("--max_seq_length", type=int, default=128, help="max sequence length") +parser.add_argument( + "--model_type", type=str, default="bert", choices=["distilbert", "bert", "mobilebert", "roberta"], help="model type" +) +parser.add_argument( + "--device", + type=str, + default="cpu", + choices=["cpu", "npu"], +) +args = parser.parse_args() + + +class ONNXRTBertDataset: + """Dataset used for model Bert. + Args: data_dir (str): The input data dir. + model_name_or_path (str): Path to pre-trained student model or shortcut name, + selected in the list: + max_seq_length (int, default=128): The maximum length after tokenization. + Sequences longer than this will be truncated, + sequences shorter will be padded. + do_lower_case (bool, default=True): Whether to lowercase the input when tokenizing. + task (str, default=mrpc): The name of the task to fine-tune. 
+ Choices include mrpc, qqp, qnli, rte, + sts-b, cola, mnli, wnli. + model_type (str, default="bert"): model type, support "distilbert", "bert", + "mobilebert", "roberta". + dynamic_length (bool, default=False): Whether to use fixed sequence length. + evaluate (bool, default=True): Whether do evaluation or training. + transform (transform object, default=None): transform to process input data. + filter (Filter objects, default=None): filter out examples according + to specific conditions. + """ + + def __init__( + self, + model, + data_dir, + model_name_or_path, + max_seq_length=128, + do_lower_case=True, + task="mrpc", + model_type="bert", + dynamic_length=False, + evaluate=True, + transform=None, + filter=None, + ): + self.inputs = [inp.name for inp in onnx.load(model).graph.input] + task = task.lower() + model_type = model_type.lower() + assert task in ["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", "mnli", "wnli", "sst-2"], "Unsupported task type" + assert model_type in [ + "distilbert", + "bert", + "mobilebert", + "roberta", + ], "Unsupported \ + model type" + self.dynamic_length = dynamic_length + self.model_type = model_type + self.max_seq_length = max_seq_length + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) + self.dataset = load_and_cache_examples( + data_dir, model_name_or_path, max_seq_length, task, model_type, tokenizer, evaluate + ) + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, index): + batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in self.dataset[index]) + return batch[: len(self.inputs)], batch[-1] + + +def load_and_cache_examples(data_dir, model_name_or_path, max_seq_length, task, model_type, tokenizer, evaluate): + processor = transformers.glue_processors[task]() + output_mode = transformers.glue_output_modes[task] + # Load data features from cache or dataset file + if not os.path.exists("./dataset_cached"): + os.makedirs("./dataset_cached") + cached_features_file = os.path.join( + "./dataset_cached", + "cached_{}_{}_{}_{}".format( + "dev" if evaluate else "train", + list(filter(None, model_name_or_path.split("/"))).pop(), + str(max_seq_length), + str(task), + ), + ) + if os.path.exists(cached_features_file): + logger.info("Load features from cached file {}.".format(cached_features_file)) + features = torch.load(cached_features_file) + else: + logger.info("Create features from dataset file at {}.".format(data_dir)) + label_list = processor.get_labels() + examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) + features = convert_examples_to_features( + examples, + tokenizer, + task=task, + label_list=label_list, + max_length=max_seq_length, + output_mode=output_mode, + ) + logger.info("Save features into cached file {}.".format(cached_features_file)) + torch.save(features, cached_features_file) + # Convert to Tensors and build dataset + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) + all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long) + if output_mode == "classification": + all_labels = torch.tensor([f.label for f in features], dtype=torch.long) + elif output_mode == "regression": + all_labels = torch.tensor([f.label for f in features], 
dtype=torch.float) + dataset = data.TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_seq_lengths, all_labels) + return dataset + + +def convert_examples_to_features( + examples, + tokenizer, + max_length=128, + task=None, + label_list=None, + output_mode="classification", + pad_token=0, + pad_token_segment_id=0, + mask_padding_with_zero=True, +): + processor = transformers.glue_processors[task]() + if label_list is None: + label_list = processor.get_labels() + logger.info("Use label list {} for task {}.".format(label_list, task)) + label_map = {label: i for i, label in enumerate(label_list)} + features = [] + for ex_index, example in enumerate(examples): + inputs = tokenizer.encode_plus( + example.text_a, + example.text_b, + add_special_tokens=True, + max_length=max_length, + return_token_type_ids=True, + truncation=True, + ) + input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. + seq_length = len(input_ids) + padding_length = max_length - len(input_ids) + + input_ids = input_ids + ([pad_token] * padding_length) + attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) + + assert len(input_ids) == max_length, "Error with input_ids length {} vs {}".format(len(input_ids), max_length) + assert len(attention_mask) == max_length, "Error with attention_mask length {} vs {}".format( + len(attention_mask), max_length + ) + assert len(token_type_ids) == max_length, "Error with token_type_ids length {} vs {}".format( + len(token_type_ids), max_length + ) + if output_mode == "classification": + label = label_map[example.label] + elif output_mode == "regression": + label = float(example.label) + else: + raise KeyError(output_mode) + + feats = InputFeatures( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + label=label, + seq_length=seq_length, + ) + features.append(feats) + return features + + +@dataclasses.dataclass(frozen=True) +class InputFeatures: + """ + A single set of features of data. + Property names are the same names as the corresponding inputs to a model. + Args: + input_ids: Indices of input sequence tokens in the vocabulary. + attention_mask: Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED, + ``0`` for MASKED (padded) tokens. + token_type_ids: (Optional) Segment token indices to indicate first and second + portions of the inputs. Only some models use them. + label: (Optional) Label corresponding to the input. Int for classification problems, + float for regression problems. + seq_length: (Optional) The length of input sequence before padding. + """ + + input_ids: List[int] + attention_mask: Optional[List[int]] = None + token_type_ids: Optional[List[int]] = None + label: Optional[Union[int, float]] = None + seq_length: Optional[List[int]] = None + + +class ONNXRTGLUE: + """Computes GLUE score. + + Args: + task (str, default=mrpc): The name of the task. + Choices include mrpc, qqp, qnli, rte, + sts-b, cola, mnli, wnli. 
+ + """ + + def __init__(self, task="mrpc"): + assert task in ["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", "mnli", "wnli", "sst-2"], "Unsupported task type" + self.pred_list = None + self.label_list = None + self.task = task + self.return_key = { + "cola": "mcc", + "mrpc": "acc", + "sts-b": "corr", + "qqp": "acc", + "mnli": "mnli/acc", + "qnli": "acc", + "rte": "acc", + "wnli": "acc", + "sst-2": "acc", + } + + def update(self, preds, labels): + """add preds and labels to storage""" + if isinstance(preds, list) and len(preds) == 1: + preds = preds[0] + if isinstance(labels, list) and len(labels) == 1: + labels = labels[0] + if self.pred_list is None: + self.pred_list = preds + self.label_list = labels + else: + self.pred_list = np.append(self.pred_list, preds, axis=0) + self.label_list = np.append(self.label_list, labels, axis=0) + + def reset(self): + """clear preds and labels storage""" + self.pred_list = None + self.label_list = None + + def result(self): + """calculate metric""" + output_mode = transformers.glue_output_modes[self.task] + if output_mode == "classification": + processed_preds = np.argmax(self.pred_list, axis=1) + elif output_mode == "regression": + processed_preds = np.squeeze(self.pred_list) + result = transformers.glue_compute_metrics(self.task, processed_preds, self.label_list) + return result[self.return_key[self.task]] + + +class DataReader(data_reader.CalibrationDataReader): + def __init__(self, model_path, dynamic_length=False, batch_size=1, calibration_sampling_size=8): + self.encoded_list = [] + self.batch_size = batch_size + dataset = ONNXRTBertDataset( + args.model_path, + data_dir=args.data_path, + model_name_or_path=args.model_name_or_path, + max_seq_length=args.max_seq_length, + task=args.task, + model_type=args.model_type, + dynamic_length=args.dynamic_length, + ) + dataloader = data.DataLoader( + dataset, + sampler=data.SequentialSampler(dataset), + batch_size=self.batch_size, + shuffle=False, + ) + model = onnx.load(model_path, load_external_data=False) + inputs_names = [input.name for input in model.graph.input] + self.batch_size = batch_size + + for idx, batch in enumerate(dataloader): + if idx + 1 > calibration_sampling_size: + break + ort_input = {} + batch_seq_length = args.max_seq_length if not args.dynamic_length else torch.max(batch[0][-2], 0)[0].item() + batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in batch[0]) + + for name, inputs in zip(inputs_names, batch): + ort_input[name] = inputs[:, :batch_seq_length] + + self.encoded_list.append(ort_input) + + self.iter_next = iter(self.encoded_list) + + def get_next(self): + return next(self.iter_next, None) + + def rewind(self): + self.iter_next = iter(self.encoded_list) + + +if __name__ == "__main__": + # set config for npu test + provider = "DmlExecutionProvider" if args.device == "npu" else "CPUExecutionProvider" + + dataset = ONNXRTBertDataset( + args.model_path, + data_dir=args.data_path, + model_name_or_path=args.model_name_or_path, + max_seq_length=args.max_seq_length, + task=args.task, + model_type=args.model_type, + dynamic_length=args.dynamic_length, + ) + dataloader = data.DataLoader( + dataset, + sampler=data.SequentialSampler(dataset), + batch_size=args.batch_size, + shuffle=False, + ) + + def eval_func(model): + metric = ONNXRTGLUE(args.task) + session = onnxruntime.InferenceSession(model, providers=[provider]) + ort_inputs = {} + len_inputs = len(session.get_inputs()) + inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] + + 
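# evaluation loop: each batch is trimmed to its effective sequence length,
+ # fed to the session by matching input position, and the resulting logits and
+ # labels are accumulated into the GLUE metric +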
for idx, batch in enumerate(dataloader): + label = batch[-1] + batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in batch[0]) + batch_seq_length = args.max_seq_length if not args.dynamic_length else batch[0].shape[-1] + inputs = [ + batch[0][:, :batch_seq_length], + batch[1][:, :batch_seq_length], + batch[2][:, :batch_seq_length], + ] + for i in range(len_inputs): + ort_inputs.update({inputs_names[i]: inputs[i]}) + predictions = session.run(None, ort_inputs) + metric.update(predictions[0], label) + return metric.result() + + if args.benchmark: + if args.mode == "performance": + total_time = 0.0 + num_iter = 100 + num_warmup = 10 + + sess_options = onnxruntime.SessionOptions() + sess_options.intra_op_num_threads = args.intra_op_num_threads + session = onnxruntime.InferenceSession( + args.model_path, sess_options, providers=onnxruntime.get_available_providers() + ) + ort_inputs = {} + len_inputs = len(session.get_inputs()) + inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] + + for idx, batch in enumerate(dataloader): + if idx + 1 > num_iter: + break + batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in batch[0]) + batch_seq_length = args.max_seq_length if not args.dynamic_length else batch[0].shape[-1] + inputs = [ + batch[0][:, :batch_seq_length], + batch[1][:, :batch_seq_length], + batch[2][:, :batch_seq_length], + ] + for i in range(len_inputs): + ort_inputs.update({inputs_names[i]: inputs[i]}) + tic = time.time() + predictions = session.run(None, ort_inputs) + toc = time.time() + if idx >= num_warmup: + total_time += toc - tic + + print("\n", "-" * 10, "Summary:", "-" * 10) + print(args) + throughput = (num_iter - num_warmup) / total_time + print("Throughput: {} samples/s".format(throughput)) + elif args.mode == "accuracy": + acc_result = eval_func(args.model_path) + print("Batch size = %d" % args.batch_size) + print("Accuracy: %.5f" % acc_result) + + if args.tune: + # optimize model + with tempfile.TemporaryDirectory(prefix="ort.opt.") as tmp_dir: + opt_options = FusionOptions("bert") + opt_options.enable_embed_layer_norm = False + + model_optimizer = optimizer.optimize_model( + args.model_path, "bert", num_heads=12, hidden_size=768, optimization_options=opt_options + ) + model = model_optimizer.model + + # check the optimized model is valid + try: + onnxruntime.InferenceSession(model.SerializeToString(), providers=onnxruntime.get_available_providers()) + onnx.save(model, pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix()) + model = pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix() + except Exception as e: + logger.warning("Optimized model is invalid: {}. ".format(e)) + logger.warning("Model optimizer will be skipped. 
" "Try to upgrade onnxruntime to avoid this error") + model = args.model_path + + calibration_data_reader = DataReader(args.model_path, calibration_sampling_size=8) + custom_tune_config = tuning.TuningConfig( + config_set=config.StaticQuantConfig.get_config_set_for_tuning( + quant_format=( + quantization.QuantFormat.QOperator + if args.quant_format == "QOperator" + else quantization.QuantFormat.QDQ + ), + calibration_sampling_size=8, + op_types_to_quantize=["MatMul"], + extra_options={"OpTypesToExcludeOutputQuantization": ["MatMul"]}, + execution_provider=provider, + ) + ) + best_model = tuning.autotune( + model_input=model, + tune_config=custom_tune_config, + eval_fn=eval_func, + calibration_data_reader=calibration_data_reader, + optimization_level=onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL, + ) + onnx.save(best_model, args.output_model) diff --git a/examples/nlp/bert/quantization/ptq_static/prepare_data.sh b/examples/nlp/bert/quantization/ptq_static/prepare_data.sh new file mode 100644 index 000000000..c1fddb546 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_static/prepare_data.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + download_data + +} + +# init params +function init_params { + + for var in "$@" + do + case $var in + --data_dir=*) + data_dir=$(echo "$var" |cut -f2 -d=) + ;; + --task_name=*) + task_name=$(echo "$var" |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function download_data { + wget https://raw.githubusercontent.com/huggingface/transformers/f98ef14d161d7bcdc9808b5ec399981481411cc1/utils/download_glue_data.py + python download_glue_data.py --data_dir="${data_dir}" --tasks="${task_name}" +} + +main "$@" + diff --git a/examples/nlp/bert/quantization/ptq_static/prepare_model.py b/examples/nlp/bert/quantization/ptq_static/prepare_model.py new file mode 100644 index 000000000..5b9216640 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_static/prepare_model.py @@ -0,0 +1,118 @@ +import argparse +import os +import sys +import urllib +import zipfile + +import torch +import transformers + +# Please refer to [Bert-GLUE_OnnxRuntime_quantization guide] +# (https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/notebooks/bert/Bert-GLUE_OnnxRuntime_quantization.ipynb) +# for detailed model export. 
+ +MODEL_URL = "https://download.pytorch.org/tutorial/MRPC.zip" +MAX_TIMES_RETRY_DOWNLOAD = 5 + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("--input_model", type=str, required=False, default="MRPC.zip") + parser.add_argument("--output_model", type=str, required=True) + parser.add_argument("--max_len", type=int, default=128, help="Maximum length of the sentence pairs") + return parser.parse_args() + + +def progressbar(cur, total=100): + percent = "{:.2%}".format(cur / total) + sys.stdout.write("\r[%-100s] %s" % ("#" * int(cur), percent)) + sys.stdout.flush() + + +def schedule(blocknum, blocksize, totalsize): + if totalsize == 0: + percent = 0 + else: + percent = min(1.0, blocknum * blocksize / totalsize) * 100 + progressbar(percent) + + +def is_zip_file(filename): + try: + with open(filename, "rb") as f: + magic_number = f.read(4) + return magic_number == b"PK\x03\x04" # ZIP file magic number + except OSError: + return False + + +def extrafile(filename, target_folder="."): + with zipfile.ZipFile(filename, "r") as zin: + zin.extractall(target_folder) + + +def download_model(url, model_name, retry_times=5): + if os.path.isdir(model_name): + return model_name + elif os.path.exists(model_name) and is_zip_file(model_name): + print("file downloaded") + extrafile(model_name) + return True + + print("download model...") + retries = 0 + while retries < retry_times: + try: + urllib.request.urlretrieve(url, model_name, schedule) + extrafile(model_name) + break + except KeyboardInterrupt: + return False + except: + retries += 1 + print(f"Download failed{', Retry downloading...' if retries < retry_times else '!'}") + return retries < retry_times + + +def export_model(model, output_model, max_len=128): + with torch.no_grad(): + inputs = { + "input_ids": torch.ones(1, max_len, dtype=torch.int64), + "attention_mask": torch.ones(1, max_len, dtype=torch.int64), + "token_type_ids": torch.ones(1, max_len, dtype=torch.int64), + } + + symbolic_names = {0: "batch_size", 1: "max_seq_len"} + torch.onnx.export( + model, # model being run + ( + inputs["input_ids"], + inputs["attention_mask"], + inputs["token_type_ids"], + ), # model input (or a tuple for multiple inputs) + output_model, # where to save the model (can be a file or file-like object) + opset_version=14, # the ONNX version to export the model + do_constant_folding=True, # whether to execute constant folding + input_names=["input_ids", "input_mask", "segment_ids"], # the model's input names + output_names=["output"], # the model's output names + dynamic_axes={ + "input_ids": symbolic_names, # variable length axes + "input_mask": symbolic_names, + "segment_ids": symbolic_names, + }, + ) + assert os.path.exists(output_model), f"Export failed! {output_model} doesn't exist!" 
+ print("ONNX Model exported to {0}".format(output_model)) + + +def prepare_model(input_model, output_model, max_len): + is_download_successful = download_model(MODEL_URL, input_model, MAX_TIMES_RETRY_DOWNLOAD) + if is_download_successful: + folder_name = is_download_successful if isinstance(is_download_successful, str) else "./MRPC" + model = transformers.BertForSequenceClassification.from_pretrained(folder_name) + export_model(model, output_model, max_len) + + +if __name__ == "__main__": + args = parse_arguments() + prepare_model(args.input_model, args.output_model, args.max_len) diff --git a/examples/nlp/bert/quantization/ptq_static/requirements.txt b/examples/nlp/bert/quantization/ptq_static/requirements.txt new file mode 100644 index 000000000..85dc725a4 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_static/requirements.txt @@ -0,0 +1,8 @@ +torch +transformers +accelerate +onnx +onnxruntime +coloredlogs +sympy +onnxruntime-extensions diff --git a/examples/nlp/bert/quantization/ptq_static/run_benchmark.sh b/examples/nlp/bert/quantization/ptq_static/run_benchmark.sh new file mode 100644 index 000000000..465524f04 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_static/run_benchmark.sh @@ -0,0 +1,64 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo "$var" |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo "$var" |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo "$var" |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo "$var" |cut -f2 -d=) + ;; + --intra_op_num_threads=*) + intra_op_num_threads=$(echo "$var" |cut -f2 -d=) + ;; + esac + done + +} + +# run_benchmark +function run_benchmark { + if [[ ${mode} == "accuracy" ]]; then + dynamic_length=False + elif [[ ${mode} == "performance" ]]; then + dynamic_length=True + else + echo "Error: No such mode: ${mode}" + exit 1 + fi + + model_name_or_path="bert-base-uncased" + task_name="mrpc" + + python main.py \ + --model_path "${input_model}" \ + --model_name_or_path "${model_name_or_path}" \ + --data_path "${dataset_location}" \ + --task "${task_name}" \ + --batch_size "${batch_size}" \ + --mode "${mode}" \ + --intra_op_num_threads "${intra_op_num_threads-4}" \ + --dynamic_length "${dynamic_length}" \ + --benchmark + +} + +main "$@" + diff --git a/examples/nlp/bert/quantization/ptq_static/run_quant.sh b/examples/nlp/bert/quantization/ptq_static/run_quant.sh new file mode 100644 index 000000000..976e8e0c2 --- /dev/null +++ b/examples/nlp/bert/quantization/ptq_static/run_quant.sh @@ -0,0 +1,53 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + run_tuning +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo "$var" |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo "$var" |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo "$var" |cut -f2 -d=) + ;; + --quant_format=*) + quant_format=$(echo "$var" |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_tuning { + model_name_or_path="bert-base-uncased" + batch_size=8 + task_name="mrpc" + model_type="bert" + + python main.py \ + --model_path "${input_model}" \ + --output_model "${output_model}" \ + --model_name_or_path "${model_name_or_path}" \ + --data_path "${dataset_location}" \ + --task "${task_name}" \ + --batch_size "${batch_size}" \ + --model_type "${model_type}" \ + --quant_format 
"${quant_format}" \ + --tune +} + +main "$@" + + + diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py index 9cafe62d3..572e1f010 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py @@ -33,8 +33,8 @@ from torch.nn import functional from torch.utils import data -from onnx_neural_compressor import config, data_reader, logger, utility -from onnx_neural_compressor.quantization import matmul_nbits_quantizer, tuning +from onnx_neural_compressor import data_reader +from onnx_neural_compressor.quantization import config, matmul_nbits_quantizer, tuning logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN @@ -315,10 +315,6 @@ def rewind(self): if __name__ == "__main__": - utility.set_workspace(args.workspace) - if not os.path.exists(args.workspace): - os.mkdir(args.workspace) - if args.benchmark: if args.mode == "performance": benchmark(args.model_path) @@ -331,23 +327,11 @@ def rewind(self): model_name = "model.onnx" # require optimum >= 1.14.0 model_path = os.path.join(args.model_path, model_name) - # do graph optimization - logger.info("Start graph optimization...") - sess_options = ort.SessionOptions() - sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED - sess_options.optimized_model_filepath = os.path.join(args.workspace, "Optimized_model.onnx") - sess_options.add_session_config_entry( - "session.optimized_model_external_initializers_file_name", "Optimized_model.onnx_data" - ) - sess_options.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "1024") - sess = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"]) - logger.info("Graph optimization done.") - best_model = None if args.algorithm.upper() == "RTN": algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig() quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( - sess_options.optimized_model_filepath, + model_path, n_bits=4, block_size=32, is_symmetric=True, @@ -362,7 +346,7 @@ def rewind(self): calibration_data_reader=calibration_data_reader, enable_mse_search=False ) quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( - sess_options.optimized_model_filepath, + model_path, n_bits=4, block_size=32, is_symmetric=True, @@ -377,7 +361,7 @@ def rewind(self): calibration_data_reader=calibration_data_reader, ) quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( - sess_options.optimized_model_filepath, + model_path, n_bits=4, block_size=32, is_symmetric=False, diff --git a/onnx_neural_compressor/__init__.py b/onnx_neural_compressor/__init__.py index a8e492104..2175e2eba 100644 --- a/onnx_neural_compressor/__init__.py +++ b/onnx_neural_compressor/__init__.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -# # Copyright (c) 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/onnx_neural_compressor/algorithms/layer_wise/core.py b/onnx_neural_compressor/algorithms/layer_wise/core.py index 2e381cfdb..b4a665f8c 100644 --- a/onnx_neural_compressor/algorithms/layer_wise/core.py +++ b/onnx_neural_compressor/algorithms/layer_wise/core.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -# # Copyright (c) 2023 MIT HAN Lab # This source code is licensed under 
the MIT license # @@ -24,7 +22,7 @@ import onnx import onnxruntime as ort -from onnx_neural_compressor import data_reader, logger, onnx_model, utility +from onnx_neural_compressor import data_reader, logger, onnx_model from typing import Callable, List, Union # isort: skip @@ -49,7 +47,7 @@ def layer_wise_quant( _type_: _description_ """ # check whether model shape is inferred - if not utility.check_model_with_infer_shapes(model): + if not _check_model_with_infer_shapes(model): logger.error( "Before applying layer-wise quantization, please make sure to " "run symbolic shape inference on your model like follows:\n" @@ -277,3 +275,14 @@ def _prepare_data_reader_for_next_split_model( inputs.update({name: value for name, value in zip(output_names, out)}) data_reader_for_next_split_model.append(inputs) return DataReader(data_reader_for_next_split_model) + + +def _check_model_with_infer_shapes(model): + """Check if the model has been shape inferred.""" + if isinstance(model, (pathlib.Path, str)): + model = onnx.load(model, load_external_data=False) + elif isinstance(model, onnx_model.ONNXModel): + model = model.model + if len(model.graph.value_info) > 0: + return True + return False diff --git a/onnx_neural_compressor/algorithms/post_training_quant/__init__.py b/onnx_neural_compressor/algorithms/post_training_quant/__init__.py new file mode 100644 index 000000000..28f108cb6 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py b/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py new file mode 100644 index 000000000..095897b49 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py @@ -0,0 +1,637 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft, Intel Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
+# -------------------------------------------------------------------------- +"""Calibration for onnx models.""" + +import copy +import logging +import os +import sys +from importlib import util + +import numpy as np +import onnx +import onnxruntime +from packaging import version + +from onnx_neural_compressor import logger, onnx_model +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant import calibrator + +if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): + import onnxruntime_extensions + +ONNX18_VERSION = version.Version("1.8.0") +ORT112_VERSION = version.Version("1.12.0") + + +class ONNXRTAugment: + """Augment input model to dump tensor or for calibration.""" + + def __init__( + self, + model_wrapper, + dataloader, + dump_op_types, + black_nodes=[], + white_nodes=[], + iterations=[], + execution_provider="CPUExecutionProvider", + reduce_range=False, + **kwargs, + ): + """Initialization. + + Args: + model_wrapper (Model): model to be augmented + dataloader (object): user implemented object to read in and preprocess calibration dataset + dump_op_types (list): operator types to be calibrated and quantized + black_nodes (list, optional): operator names that should not be quantized. Defaults to []. + white_nodes (list, optional): operator names that force to be quantized. Defaults to []. + iterations (list, optional): tensor of which iteration will be collected. Defaults to []. + execution_provider (list, optional): execution provider for onnxruntime. Defaults to 'CPUExecutionProvider'. + reduce_range (bool, optional): use 7 bit or not. Defaults to False. + """ + self.model_wrapper = ( + model_wrapper + if isinstance(model_wrapper, onnx_model.ONNXModel) + else onnx_model.ONNXModel(model_wrapper, load_external_data=True) + ) + self.model = self.model_wrapper.model + ai_onnx_domain = [opset for opset in self.model.opset_import if not opset.domain or opset.domain == "ai.onnx"] + self.opset_version = ai_onnx_domain[0].version + self.dataloader = dataloader + self.dump_op_types = dump_op_types + self.black_nodes = black_nodes + self.white_nodes = white_nodes + self.augmented_model = None + self.iterations = iterations + self.execution_provider = execution_provider + self.augment_nodes = [] + self.dequantized_output = {} + self.already_quantized = "DequantizeLinear" in [node.op_type for node in self.model.graph.node] + self.dynamically_quantized = False + self.ort_version = version.Version(onnxruntime.__version__) + self.reduce_range = reduce_range + + def augment_graph(self): + """Augment_graph. + + Adds nodes to all quantization_candidates op type nodes in model and + ensures their outputs are stored as part of the graph output. + + Args: + activation_only (bool, optional): whether to dump activation tensor only. Defaults to False. + weight_only (bool, optional): whether to dump weight_only. Defaults to False. 
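+ Note: this method itself takes no arguments; which tensors are exposed is
+ controlled by the instance attributes dump_op_types, black_nodes, white_nodes
+ and augment_nodes.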
+ """ + self.dequantized_output.clear() + onnx_version = version.Version(onnx.__version__) + if onnx_version < ONNX18_VERSION: + logger.warning("Static quantization for NLP model is supported at onnx 1.8.0 and newer.") + if self.already_quantized and any( + [i.dims in [1, 2] for i in self.model_wrapper.initializer() if i.name.endswith("_scale")] + ): + if self.opset_version < 13 and self.ort_version >= ORT112_VERSION: + logger.warning( + "Please use onnxruntime < 1.12.0 or upgrade model opset " + "version to 13 or higher to inspect per-channel quantized weight" + ) + + model = copy.deepcopy(self.model) + model_nodes_names = [node.name for node in model.graph.node] + + added_nodes = [] + added_outputs = [] + tensors_to_dump = set() + + for augment_node_type in self.augment_nodes: + if augment_node_type not in ["DequantizeLinear"]: # pragma: no cover + raise ValueError( + "Unexpected augment_node {} only DequantizeLinear is supported".format(augment_node_type) + ) + + if self.already_quantized: + # mapping between fp32 node and int8 node + new_white_nodes = [] + for white_node in self.white_nodes: + new_white_node = white_node + "_quant" + assert new_white_node in model_nodes_names, "no quantized {} in the graph".format(white_node) + new_white_nodes.append(new_white_node) + self.white_nodes = new_white_nodes + + node_outputs = [] + for node in model.graph.node: # pylint: disable=no-member + node_outputs.extend(node.output) + should_be_dump = ((node.op_type in self.dump_op_types) and (node.name not in self.black_nodes)) or ( + node.name in self.white_nodes + ) + if should_be_dump: + # add input tensors which should be dump + for input in node.input: + if len(input) != 0: # to prevent input is "" + initializer_tensor = self.model_wrapper.get_initializer(input) + if initializer_tensor is None: + tensors_to_dump.add(input) + # add output tensors which should be dump + tensors_to_dump.update([output for output in node.output if len(output) != 0]) + + model_inputs = [i.name for i in model.graph.input] + for tensor in tensors_to_dump: + if tensor not in node_outputs and tensor not in model_inputs: + continue + if self.augment_nodes: + for augment_node_type in self.augment_nodes: + if augment_node_type in ["DequantizeLinear"]: + # insert DequantizeLinear node as output + if tensor.endswith("_scale") or tensor.endswith("_zero_point"): # pragma: no cover + continue + + if not self.dynamically_quantized: + tensor = ( + tensor.replace("_QuantizeInput", "_quantized") + if tensor.endswith("_QuantizeInput") + else tensor + ) + else: + tensor = ( + tensor.replace("_output_quantized", "") + if tensor.endswith("_output_quantized") + else tensor + ) + + augment_node_name = tensor + "_new_" + augment_node_type + scale, zero_point = self.model_wrapper.get_scale_zero(tensor) + if scale: + # the tensor is in INT8 dtype + nodes, output = self._dequantize(tensor, scale, zero_point) + if output: + added_nodes.extend(nodes) + added_outputs.append( + onnx.helper.make_tensor_value_info( + output, onnx.TensorProto.FLOAT, () # pylint: disable=no-member + ) + ) # pylint: disable=no-member + else: + # the tensor is in FP32 dtype + if tensor not in [t.name for t in model.graph.output]: + added_tensor = onnx.helper.ValueInfoProto() + added_tensor.name = tensor + added_outputs.append(added_tensor) + else: + if tensor not in [t.name for t in model.graph.output]: + added_tensor = onnx.helper.ValueInfoProto() + added_tensor.name = tensor + added_outputs.append(added_tensor) + + if self.augment_nodes: + 
model.graph.node.extend(added_nodes) # pylint: disable=no-member + model.graph.output.extend(added_outputs) # pylint: disable=no-member + + self.augmented_model = model + if self.model_wrapper.is_large_model: # pragma: no cover + onnx.save_model( + model, + self.model_wrapper.model_path + "_augment.onnx", + save_as_external_data=True, + all_tensors_to_one_file=True, + convert_attribute=False, + ) + + def get_activation_tensors_calib_range(self, q_config=None): + """Get calib ranges of activation tensors. + + Args: + q_config (dict, optional): quantization config. Defaults to None. + + Returns: + dict: calib ranges + """ + # conduct inference session and get intermediate outputs + so = onnxruntime.SessionOptions() + so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL + if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): + so.register_custom_ops_library(onnxruntime_extensions.get_library_path()) + + execution_provider = ( + self.execution_provider + if self.execution_provider != "TensorrtExecutionProvider" + else "CUDAExecutionProvider" + ) + session = ( + onnxruntime.InferenceSession(self.augmented_model.SerializeToString(), so, providers=[execution_provider]) + if not self.model_wrapper.is_large_model + else onnxruntime.InferenceSession( + self.model_wrapper.model_path + "_augment.onnx", so, providers=[execution_provider] + ) + ) + + len_inputs = len(session.get_inputs()) + inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] + len_outputs = len(session.get_outputs()) + outputs_names = [session.get_outputs()[i].name for i in range(len_outputs)] + + node_output_names = [ + output.name if output.name not in self.dequantized_output else self.dequantized_output[output.name] + for output in session.get_outputs() + ] + augment_model_wrapper = ( + onnx_model.ONNXModel(self.augmented_model, load_external_data=False) + if not self.model_wrapper.is_large_model + else onnx_model.ONNXModel(self.model_wrapper.model_path + "_augment.onnx", load_external_data=False) + ) + input_name_to_nodes = augment_model_wrapper.input_name_to_nodes() + output_name_to_node = augment_model_wrapper.output_name_to_node() + name_to_node = {} + for data_name in node_output_names: + node = None + if data_name in output_name_to_node: + node = output_name_to_node[data_name] + elif data_name in input_name_to_nodes: + node = input_name_to_nodes[data_name][0] + assert node, "{} is neither an input nor an output of nodes in augmented model.".format(data_name) + name_to_node[data_name] = node.name + + activation_tensors_calib_range = {} + intermediate_tensor = {} + name_to_calibrator = {} + ort_inputs_for_next_split_model = [] + + def _collect_data(inputs): + for output_idx, output in enumerate(session.run(None, inputs)): + if q_config is not None and output.size != 0: + node_name = name_to_node[node_output_names[output_idx]] + if node_output_names[output_idx] not in name_to_calibrator: + calib_method = ( + q_config[node_name]["calibrate_method"] if q_config and node_name in q_config else "MinMax" + ) + assert calib_method in calibrator.CALIBRATOR, "Calibration method {} is not registered.".format( + calib_method + ) + _calibrator = calibrator.CALIBRATOR[calib_method]() + else: + _calibrator = name_to_calibrator[node_output_names[output_idx]] + + # currently, the calibration range for each iteration is collected if + # the calibration method is minmax, otherwise the tensor data is collected. 
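+ # (MinMax keeps only a running [min, max] per tensor, while Entropy and
+ # Percentile buffer the raw outputs in intermediate_tensor until every
+ # calibration batch has been seen.)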
+ # TODO: for entropy and percentile method, need to support range collection + # per iteration in the future. + if _calibrator.method_name == "MinMax": + _calibrator.collect(output) + activation_tensors_calib_range[node_output_names[output_idx]] = [list(_calibrator.calib_range)] + name_to_calibrator[node_output_names[output_idx]] = _calibrator + else: + intermediate_tensor.setdefault((node_output_names[output_idx], node_name), []).append(output) + elif q_config is None: + activation_tensors_calib_range.setdefault(node_output_names[output_idx], []).append(output) + + idx = 0 + while True: + inputs = self.dataloader.get_next() + if not inputs: + break + if self.iterations != []: + if idx > max(self.iterations): + break + if idx in self.iterations: + _collect_data(inputs) + else: + _collect_data(inputs) + idx += 1 + + # for entropy and percentile method, collect calibration range after all tensors are collected. + merged_dict = intermediate_tensor + for (output_name, node_name), datas in merged_dict.items(): + if any([data is None for data in datas]): + continue + if any([data.dtype in [bool] for data in datas]): # output type of some ops is bool, skip + continue + calib_method = q_config[node_name]["calibrate_method"] if q_config and node_name in q_config else 0 + _calibrator = calibrator.CALIBRATOR[calib_method]() + _calibrator.collect(datas) + activation_tensors_calib_range.setdefault(output_name, []).append(list(_calibrator.calib_range)) + _calibrator.clear() + del _calibrator + + return activation_tensors_calib_range + + def get_weight_tensors_calib_range(self): + """Get calib ranges of weight tensors. + + Returns: + dict: calib ranges + """ + model_nodes_names = [node.name for node in self.model.graph.node] + + # if augmented_model is not None, it means self.white_nodes is already updated in augment_graph func + # then skip update here + if self.already_quantized and self.augmented_model is None: + # mapping between fp32 node and int8 node + new_white_nodes = [] + for white_node in self.white_nodes: + new_white_node = white_node + "_quant" + assert new_white_node in model_nodes_names, "no quantized {} in the " "graph".format(white_node) + new_white_nodes.append(new_white_node) + self.white_nodes = new_white_nodes + + added_outputs = set() + initializer_tensors_to_dump = [] + initializers = [init.name for init in self.model.graph.initializer] + for node in self.model.graph.node: # pylint: disable=no-member + should_be_dump = ((node.op_type in self.dump_op_types) and (node.name not in self.black_nodes)) or ( + node.name in self.white_nodes + ) + if should_be_dump: + for input in node.input: + if ( + (self.already_quantized and input.replace("_dequantized", "_quantized") in initializers) + or (not self.already_quantized and input in initializers) + ) and len(input) != 0: + added_outputs.add(input) + + for tensor in added_outputs: + if tensor not in initializers: + continue + if self.augment_nodes: + for augment_node_type in self.augment_nodes: + if augment_node_type in ["DequantizeLinear"]: + if not (tensor.endswith("_scale") or tensor.endswith("_zero_point")): + initializer_tensors_to_dump.append(tensor) + else: + initializer_tensors_to_dump.append(tensor) + + weight_tensors_calib_range = {} + for initializer_tensor_name in initializer_tensors_to_dump: + initializer_tensor = self.model_wrapper.get_initializer(initializer_tensor_name) + + # double check initializer tensor is not None + if initializer_tensor is None: # pragma: no cover + continue + + initializer_tensor = 
onnx.numpy_helper.to_array( + initializer_tensor, + base_dir=( + os.path.dirname(self.model_wrapper.model_path) if self.model_wrapper.model_path is not None else "" + ), + ) + _calibrator = calibrator.CALIBRATOR["MinMax"]() # use minmax method to calibrate initializer tensors + if initializer_tensor.flatten().size > 0: + _calibrator.collect(initializer_tensor) + weight_tensors_calib_range[initializer_tensor_name] = [list(_calibrator.calib_range)] + _calibrator.clear() + del _calibrator + return weight_tensors_calib_range + + def get_intermediate_outputs(self, q_config=None, activation_only=False, weight_only=False): + """Gather intermediate model outputs after running inference.""" + output_dicts = {} + if not activation_only and not weight_only: + output_dicts = self.get_activation_tensors_calib_range(q_config) + output_dicts.update(self.get_weight_tensors_calib_range()) + elif weight_only: + output_dicts = self.get_weight_tensors_calib_range() + elif activation_only: + output_dicts = self.get_activation_tensors_calib_range(q_config) + + return list(output_dicts.keys()), output_dicts + + def _dequantize(self, tensor, scale_tensor, zo_tensor): + """Helper function to dequantize tensor.""" + int_tensor = self.model_wrapper.get_initializer(tensor) + if int_tensor: # weight tensor + return self._dequantize_weight(tensor, scale_tensor, zo_tensor) + else: + return self._dequantize_activation(tensor, scale_tensor, zo_tensor) + + def _dequantize_activation(self, activation_tensor_name, scale_tensor, zo_tensor): + """Helper function to dequantize activation.""" + added_nodes, added_output = self._add_dequantize_node(activation_tensor_name, scale_tensor, zo_tensor) + self.dequantized_output[added_output] = activation_tensor_name + return added_nodes, added_output + + def _dequantize_weight(self, weight_tensor_name, scale_tensor, zo_tensor): + """Helper function to dequantize weight.""" + weight_tensor = self.model_wrapper.get_initializer(weight_tensor_name) + if len(scale_tensor.dims) in [1, 2] and weight_tensor.dims[0] == max(scale_tensor.dims): + logger.debug("weight {} is quantized with per channel granularity.".format(weight_tensor_name)) + if self.opset_version < 13 and self.ort_version >= ORT112_VERSION: + logger.warning( + "Skip dequantizing weight {}, please use onnxruntime < 1.12.0 " + "or upgrade model opset version to 13 or higher".format(weight_tensor_name) + ) + return [], None + node = self.model_wrapper.input_name_to_nodes()[weight_tensor_name][0] + if "Conv" in node.op_type or ("Gemm" in node.op_type and quant_utils.is_B_transposed(node)): + added_nodes, added_output = self._add_dequantize_transpose_node( + weight_tensor_name, scale_tensor, zo_tensor, len(weight_tensor.dims) + ) + else: + added_nodes, added_output = self._add_dequantize_node( + weight_tensor_name, scale_tensor, zo_tensor, axis=1 if self.opset_version > 12 else None + ) + else: + added_nodes, added_output = self._add_dequantize_node(weight_tensor_name, scale_tensor, zo_tensor) + self.dequantized_output[added_output] = weight_tensor_name + return added_nodes, added_output + + def _add_dequantize_node(self, tensor_name, scale_tensor, zo_tensor, axis=None): + """Helper function to generate dequantize node.""" + dequantize_node = onnx.helper.make_node( + "DequantizeLinear", + [tensor_name, scale_tensor.name, zo_tensor.name], + [tensor_name + "_output"], + tensor_name + "_DequantizeLinear", + axis, + ) + return [dequantize_node], tensor_name + "_output" + + def _add_dequantize_transpose_node(self, tensor_name, 
scale_tensor, zo_tensor, dim): + """Insert Transpose-DequantizelLinear-Transpose pairs.""" + pre_transpose_node = onnx.helper.make_node( + "Transpose", + inputs=[tensor_name], + outputs=[tensor_name + "_transposed"], + perm=(1, 0, 2, 3) if dim == 4 else (1, 0), + name=tensor_name + "_pre_transpose", + ) + dequantize_node = onnx.helper.make_node( + "DequantizeLinear", + [tensor_name + "_transposed", scale_tensor.name, zo_tensor.name], + [tensor_name + "_DequantizeLinear"], + tensor_name + "_DequantizeLinear", + axis=1 if self.opset_version > 12 else None, + ) + post_transpose_node = onnx.helper.make_node( + "Transpose", + inputs=[tensor_name + "_DequantizeLinear"], + outputs=[tensor_name + "_output"], + perm=(1, 0, 2, 3) if dim == 4 else (1, 0), + name=tensor_name + "_post_transpose", + ) + added_nodes = [pre_transpose_node, dequantize_node, post_transpose_node] + return added_nodes, tensor_name + "_output" + + def _map_calibration(self, node_output_names, output_dicts): + """Map tensor names and min/max values.""" + merged_dict = {} + for name, minmaxs in output_dicts.items(): + for minmax in minmaxs: + if len(minmax) < 2: + continue + merged_dict.setdefault(name + "_Min", []).append(minmax[0]) + merged_dict.setdefault(name + "_Max", []).append(minmax[1]) + + # Characterizing distribution of a node's values across test data sets + clean_merged_dict = dict((i, merged_dict[i]) for i in merged_dict) + pairs = [ + tuple([float(min(clean_merged_dict[name + "_Min"])), float(max(clean_merged_dict[name + "_Max"]))]) + for name in node_output_names + ] + + final_dict = dict(zip(node_output_names, pairs)) + return final_dict + + def dump_minmax(self, q_config): + """Get calib ranges of tensors.""" + # pipeline of getting calib ranges of tensors during calibration: + # 1. augment_graph(): insert activation tensors to model output + # 2. get_intermediate_outputs(): + # 2.1 get_activation_tensors_calib_range(): get calib ranges of activation tensors using the augment graph + # 2.2 get_weight_tensors_calib_range(): get calib ranges of weight tensors + self.augment_graph() + node_output_names, output_dicts = self.get_intermediate_outputs(q_config) + return self._map_calibration(node_output_names, output_dicts) + + def dump_calibration(self, q_config, min_max=None): + """Gather calibration params for quantization. + + Args: + q_config (dict): op-wise quantization config + min_max (dict, optional): min/max values of tensors + """ + return ( + self.calculate_quantization_params(q_config, self.dump_minmax(q_config)) + if min_max is None + else self.calculate_quantization_params(q_config, min_max) + ) + + def calculate_quantization_params(self, q_config, quantization_thresholds): + """Given quantization thresholds, calculate the quantization params. 
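+
+ The result maps each tensor name to a [zero_point, scale] pair derived from
+ its calibration range and the activation type/symmetry configured for the
+ surrounding nodes.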
+ + Args: + q_config (dict): op-wise quantization config + quantization_thresholds (dict): Dictionary specifying the min and max values + or outputs of conv and matmul nodes, should be + specified in the following format: + {"param_name": [min, max]} + """ + if quantization_thresholds is None: + raise ValueError( + "quantization thresholds is required to calculate quantization \ + params (zero point and scale)" + ) + + quantization_params = {} + model = self.model + + input_name_to_nodes = self.model_wrapper.input_name_to_nodes() + output_name_to_node = self.model_wrapper.output_name_to_node() + + for tensor_name in quantization_thresholds.keys(): + child = None + if tensor_name in input_name_to_nodes: + children = input_name_to_nodes[tensor_name] + if len(children) == 1: + child = children[0] + parent = None + sym = False + qType = 2 # uint8 + + # input and output tensor follow activation_type and activation_sym + if tensor_name in input_name_to_nodes and any( + [i.name in q_config for i in input_name_to_nodes[tensor_name]] + ): + for child in input_name_to_nodes[tensor_name]: + if child.name in q_config and q_config[child.name] not in ["fp32", "fp16", "bf16"]: + sym = q_config[child.name]["activation_sym"] + qType = q_config[child.name]["activation_type"] + break + elif ( + tensor_name in output_name_to_node + and output_name_to_node[tensor_name].name in q_config + and q_config[output_name_to_node[tensor_name].name] not in ["fp32", "fp16", "bf16"] + ): + sym = q_config[output_name_to_node[tensor_name].name]["activation_sym"] + qType = q_config[output_name_to_node[tensor_name].name]["activation_type"] + if self.execution_provider in ["TensorrtExecutionProvider"]: + # TensorrtExecutionProvider only support int8 + qType = 3 + node_thresholds = quantization_thresholds[tensor_name] + node_params = self.calculate_scale_zeropoint( + parent, + child, + node_thresholds[0], + node_thresholds[1], + sym, + qType, + ) + quantization_params[tensor_name] = node_params + + return quantization_params + + def calculate_scale_zeropoint(self, last_node, next_node, rmin, rmax, sym, qType): + """Given the source and destination node of tensor, return calculated zero point and scales.""" + zp_and_scale = [] + # adjust rmin and rmax such that 0 is included in the range. This is required + # to make sure zero can be uniquely represented. 
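+ # e.g. a tensor observed in [0.2, 3.1] is widened to [0.0, 3.1] so that a real
+ # value of 0 maps exactly onto one representable quantized integer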
+ rmin = min(rmin, 0) + rmax = max(rmax, 0) + if next_node: + if next_node.op_type == "Relu": + if rmin < 0: + rmin = 0 + elif next_node.op_type == "Clip" and len(next_node.input) == 3: + if self.model_wrapper.get_initializer(next_node.input[1]) is not None: + clip_min = onnx.numpy_helper.to_array(self.model_wrapper.get_initializer(next_node.input[1])) + if rmin < clip_min: + rmin = clip_min.tolist() if not isinstance(clip_min.tolist(), list) else clip_min.tolist()[0] + if self.model_wrapper.get_initializer(next_node.input[2]) is not None: + clip_max = onnx.numpy_helper.to_array(self.model_wrapper.get_initializer(next_node.input[2])) + if rmax > clip_max: + rmax = clip_max.tolist() if not isinstance(clip_max.tolist(), list) else clip_max.tolist()[0] + + if last_node: + if last_node.op_type in ["Conv", "FusedConv"]: + attrs = [attr for attr in last_node.attribute] + attrs_names = [attr.name for attr in last_node.attribute] + if "activation" in attrs_names: + if attrs[attrs_names.index("activation")].s == b"Relu": + rmin = max(rmin, 0) + if attrs[attrs_names.index("activation")].s == b"Clip": + assert ( + "activation_params" in attrs_names + ), "the model contains no params for clip node {}".format(last_node) + clip_params = attrs[attrs_names.index("activation_params")].floats + rmin = min(rmin, clip_params[0], clip_params[1]) + rmax = max(rmax, clip_params[0], clip_params[1]) + + scale, zp = quant_utils.calculate_scale_zp(rmin, rmax, qType, sym, self.reduce_range) + zp_and_scale.append(zp) + zp_and_scale.append(scale) + + return zp_and_scale diff --git a/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py b/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py new file mode 100644 index 000000000..abef2d323 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py @@ -0,0 +1,401 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft, Intel Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- +"""Calibrator for onnx models.""" + +import copy + +import numpy as np +from scipy import stats + +CALIBRATOR = {} + + +def calib_registry(calib_method): + """The class decorator used to register all Calibrator subclasses.""" + + def decorator_calib(cls): + assert cls.__name__.endswith( + "Calibrator" + ), "The name of subclass of Calibrator should end with 'Calibrator' substring." 
+ if cls.__name__[: -len("Calibrator")] in CALIBRATOR: # pragma: no cover + raise ValueError("Cannot have two operators with the same name.") + CALIBRATOR[calib_method] = cls + return cls + + return decorator_calib + + +class CalibratorBase: + """Base calibrator class.""" + + def __init__(self): + """Initialize base calibrator class.""" + self._calib_min = None + self._calib_max = None + + def collect(self, datas): + """Collect calibration range.""" + self.collect_calib_data(datas) + + def clear(self): + """Clear calibration range.""" + self._calib_min = None + self._calib_max = None + + def collect_calib_data(self, datas): + """Collect calibration range value.""" + raise NotImplementedError + + @property + def calib_range(self): + """Get calibration range value.""" + return self._calib_min, self._calib_max + + +@calib_registry(calib_method="MinMax") +class MinMaxCalibrator(CalibratorBase): + """MinMax calibrator class.""" + + def __init__(self): + """Initialize minmax calibrator class.""" + super(MinMaxCalibrator, self).__init__() + + def collect_calib_data(self, datas): + """Collect calibration range.""" + if isinstance(datas, list) and len(set([data.shape for data in datas])) != 1: + for data in datas: + if data.size == 0: # pragma: no cover + continue + self._collect_value(data) + else: + datas = np.asarray(datas) + datas = datas.flatten() + assert datas.size > 0, "collected intermediate data size" "should not be 0, please check augmented_model" + self._collect_value(datas) + + def _collect_value(self, data): + """Collect min/max value.""" + data = np.asarray(data) + + local_min = np.min(data[np.isinf(data) == False]) # noqa: E712 + local_max = np.max(data[np.isinf(data) == False]) # noqa: E712 + if self._calib_min is None and self._calib_max is None: + self._calib_min = local_min + self._calib_max = local_max + else: + self._calib_min = np.minimum(self._calib_min, local_min) + self._calib_max = np.maximum(self._calib_max, local_max) + + @property + def method_name(self): + """Get calibration method name.""" + return "MinMax" + + +@calib_registry(calib_method="Percentile") +class PercentileCalibrator(CalibratorBase): + """Percentile calibrator class. + + Args: + num_bins (int, optional): number of bins to create a new histogram + for collecting tensor values. Defaults to 2048. + percentile (float, optional): A float number between [0, 100]. Defaults to 99.999. + """ + + def __init__(self, num_bins=2048, percentile=99.999): + """Initialize percentile calibrator class.""" + super(PercentileCalibrator, self).__init__() + self.collector = None + self.num_bins = num_bins + self.percentile = percentile + + def collect_calib_data(self, datas): + """Collect calibration range.""" + if not self.collector: + self.collector = HistogramCollector(self.num_bins) + self.collector.collect_data(datas) + self.compute_percentile_range(self.percentile) + + def compute_percentile_range(self, percentile): + """Compute percentile range.""" + if percentile < 0 or percentile > 100: + raise ValueError("Invalid percentile. 
Must be in range 0 <= percentile <= 100.") + + calib_hist, calib_bin_edges, min_range, max_range, th = self.collector.histogram + total = calib_hist.sum() + cdf = np.cumsum(calib_hist / total) + percent_to_cut_one_side = (100.0 - percentile) / 200.0 + max_idx = np.searchsorted(cdf, 1.0 - percent_to_cut_one_side) + min_idx = np.searchsorted(cdf, percent_to_cut_one_side) + self._calib_min = calib_bin_edges[min_idx].astype("float32") + self._calib_max = calib_bin_edges[max_idx].astype("float32") + if self._calib_min < min_range: + self._calib_min = min_range + if self._calib_max > max_range: + self._calib_max = max_range + + def clear(self): + """Clear calibration range.""" + self._calib_min = None + self._calib_max = None + self.collector = None + + @property + def method_name(self): + """Get calibration method name.""" + return "Percentile" + + +@calib_registry(calib_method="Entropy") +class EntropyCalibrator(CalibratorBase): + """Entropy calibrator class. + + Args: + num_bins (int, optional):number of bins to create a new histogram + for collecting tensor values. Defaults to 128. + num_quantized_bins (int, optional): number of quantized bins. Defaults to 128. + """ + + def __init__(self, num_bins=128, num_quantized_bins=128): + """Initialize entropy calibrator class.""" + super(EntropyCalibrator, self).__init__() + self.collector = None + self.num_bins = num_bins + self.num_quantized_bins = num_quantized_bins + + def collect_calib_data(self, datas): + """Collect calibration range.""" + if not self.collector: + self.collector = HistogramCollector(self.num_bins) + self.collector.collect_data(datas) + self.compute_kl_range() + + def compute_kl_range(self): + """Compute entropy range.""" + histogram = self.collector.histogram + self._calib_min, self._calib_max = self.get_kl_threshold(histogram, self.num_quantized_bins) + + def get_kl_threshold(self, histogram, num_quantized_bins): + """Compute entropy threshold. + + Ref: + https://github.com//apache/incubator-mxnet/blob/master/python/mxnet/contrib/quantization.py + https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/calibrate.py + + Args: + histogram (tuple): hist, hist_edges, min, max and threshold + num_quantized_bins (int): number of quantized bins. 
+ + Returns: + float: optimal threshold + """ + hist = histogram[0] + hist_edges = histogram[1] + num_bins = hist.size + zero_bin_index = num_bins // 2 + num_half_quantized_bin = num_quantized_bins // 2 + + kl_divergence = np.zeros(zero_bin_index - num_half_quantized_bin + 1) + thresholds = [(0, 0) for i in range(kl_divergence.size)] + + for i in range(num_half_quantized_bin, zero_bin_index + 1, 1): + start_index = zero_bin_index - i + end_index = zero_bin_index + i + 1 if (zero_bin_index + i + 1) <= num_bins else num_bins + + thresholds[i - num_half_quantized_bin] = ( + float(hist_edges[start_index]), + float(hist_edges[end_index]), + ) + + sliced_distribution = copy.deepcopy(hist[start_index:end_index]) + + # reference distribution p + p = sliced_distribution.copy() # a copy of np array + left_outliers_count = sum(hist[:start_index]) + right_outliers_count = sum(hist[end_index:]) + p[0] += left_outliers_count + p[-1] += right_outliers_count + + # nonzeros[i] incidates whether p[i] is non-zero + nonzeros = (p != 0).astype(np.int64) + + # quantize p.size bins into quantized bins (default 128 bins) + quantized_bins = np.zeros(num_quantized_bins, dtype=np.int64) + num_merged_bins = sliced_distribution.size // num_quantized_bins + + # merge bins into quantized bins + for index in range(num_quantized_bins): + start = index * num_merged_bins + end = start + num_merged_bins + quantized_bins[index] = sum(sliced_distribution[start:end]) + quantized_bins[-1] += sum(sliced_distribution[num_quantized_bins * num_merged_bins :]) + + # in order to compare p and q, we need to make length of q equals to length of p + # expand quantized bins into p.size bins + q = np.zeros(p.size, dtype=np.int64) + for index in range(num_quantized_bins): + start = index * num_merged_bins + end = start + num_merged_bins + + norm = sum(nonzeros[start:end]) + if norm != 0: + q[start:end] = float(quantized_bins[index]) / float(norm) + + p = smooth_distribution(p) + q = smooth_distribution(q) + + if isinstance(q, np.ndarray): + kl_divergence[i - num_half_quantized_bin] = stats.entropy(p, q) + else: + kl_divergence[i - num_half_quantized_bin] = float("inf") + + min_kl_divergence_idx = np.argmin(kl_divergence) + optimal_threshold = thresholds[min_kl_divergence_idx] + min_value = histogram[2] + max_value = histogram[3] + if optimal_threshold[0] < min_value: + optimal_threshold = (min_value, optimal_threshold[1]) + if optimal_threshold[1] > max_value: + optimal_threshold = (optimal_threshold[0], max_value) + return optimal_threshold[0], optimal_threshold[1] + + def clear(self): + """Clear calibration range.""" + self._calib_min = None + self._calib_max = None + self.collector = None + + @property + def method_name(self): + """Get calibration method name.""" + return "Entropy" + + +class HistogramCollector: + """Histogram collctor class.""" + + def __init__(self, num_bins=2048): + """Initialize histogram collctor.""" + self._num_bins = num_bins + self._histogram = None + + def collect_data(self, datas): + """Collect histogram data.""" + if isinstance(datas, list) and len(set([data.shape for data in datas])) != 1: + for data in datas: + if data.size == 0: # pragma: no cover + continue + self._collect_value(data) + else: + datas = np.asarray(datas) + datas = datas.flatten() + assert datas.size > 0, "collected intermediate data size" "should not be 0, please check augmented_model" + self._collect_value(datas) + + def _collect_value(self, data): + """Collect value.""" + data = np.asarray(data) + min_range = np.min(data) + max_range = 
np.max(data) + + th = max(abs(min_range), abs(max_range)) + if self._histogram is None: + hist, hist_edges = np.histogram(data, self._num_bins, range=(-th, th)) + self._histogram = (hist, hist_edges, min_range, max_range, th) + else: + self._histogram = self.combine_histogram(self._histogram, data, min_range, max_range, th) + + def combine_histogram(self, old_hist, data_arr, new_min, new_max, new_th): + """Combine histogram.""" + (old_hist, old_hist_edges, old_min, old_max, old_th) = old_hist + + if new_th <= old_th: + hist, _ = np.histogram(data_arr, bins=len(old_hist), range=(-old_th, old_th)) + return ( + old_hist + hist, + old_hist_edges, + min(old_min, new_min), + max(old_max, new_max), + old_th, + ) + else: + # Need to generate new histogram with new_th + if old_th == 0: + hist, hist_edges = np.histogram(data_arr, len(old_hist), range=(-new_th, new_th)) + hist += old_hist + else: + old_num_bins = len(old_hist) + old_step = 2 * old_th / old_num_bins + half_increased_bins = int((new_th - old_th) // old_step + 1) + new_num_bins = half_increased_bins * 2 + old_num_bins + new_th = half_increased_bins * old_step + old_th + hist, hist_edges = np.histogram(data_arr, bins=new_num_bins, range=(-new_th, new_th)) + hist[half_increased_bins : new_num_bins - half_increased_bins] += old_hist + return ( + hist, + hist_edges, + min(old_min, new_min), + max(old_max, new_max), + new_th, + ) + + @property + def histogram(self): + """Get histogram.""" + return self._histogram + + +def smooth_distribution(p, eps=0.0001): + """Smooth distribution. + + Given a discrete distribution (may have not been normalized to 1), + smooth it by replacing zeros with eps multiplied by a scaling factor + and taking the corresponding amount off the non-zero values. + Ref: + http://hanj.cs.illinois.edu/cs412/bk3/KL-divergence.pdf + https://github.com//apache/incubator-mxnet/blob/master/python/mxnet/contrib/quantization.py + https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/calibrate.py + + Args: + p (array): distribution array + eps (float, optional): a small probability. Defaults to 0.0001. + + Returns: + array: smoothed distribution + """ + is_zeros = (p == 0).astype(np.float32) + is_nonzeros = (p != 0).astype(np.float32) + n_zeros = is_zeros.sum() + n_nonzeros = p.size - n_zeros + + if not n_nonzeros: + return -1 + eps1 = eps * float(n_zeros) / float(n_nonzeros) + assert eps1 < 1.0, "n_zeros=%d, n_nonzeros=%d, eps1=%f" % ( + n_zeros, + n_nonzeros, + eps1, + ) + + hist = p.astype(np.float32) + hist += eps * is_zeros + (-eps1) * is_nonzeros + assert (hist <= 0).sum() == 0 + + return hist diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/__init__.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/__init__.py new file mode 100644 index 000000000..454c3ea69 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Operators for onnx model.""" + +import glob +from os import path + +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + +modules = glob.glob(path.join(path.dirname(__file__), "*.py")) + +for f in modules: + if path.isfile(f) and not f.startswith("__") and not f.endswith("__init__.py"): + __import__(path.basename(f)[:-3], globals(), locals(), level=1) + +OPERATORS = base_op.OPERATORS diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/activation.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/activation.py new file mode 100644 index 000000000..c06d92dac --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/activation.py @@ -0,0 +1,112 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Activation operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="LeakyRelu, Sigmoid", mode=[constants.STATIC_QUANT]) +class ActivationOperator(base_op.Operator): + """Activation operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(ActivationOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + data_found, _, _, _, _ = self.quantizer._get_quantization_params(node.output[0]) + if not data_found: + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + super().quantize() + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + + children = self.quantizer.model.get_children(node) + if len(children) == 0 or not node.name.endswith("_quant"): + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parent = self.quantizer.model.get_parents(node)[0] + child = self.quantizer.model.get_children(node)[0] + + inputs = [] + inputs.extend(parent.input) + inputs.extend(child.input[1:]) + + qlinear_activation_output = child.output[0] + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + kwargs["domain"] = quant_utils.ms_domain + + qlinear_activation_node = onnx.helper.make_node( + "QLinear" + node.op_type, inputs, [qlinear_activation_output], node.name, **kwargs + ) + + self.quantizer.new_nodes.append(qlinear_activation_node) + self.quantizer.remove_nodes.extend([parent, child, node]) + + +@base_op.op_registry(op_types="Relu, Clip", mode=[constants.STATIC_QUANT]) +class RemovableActivationOperator(base_op.Operator): + """Removable activation operator.""" + + def __init__(self, 
onnx_quantizer, onnx_node): + """Initialization.""" + super(RemovableActivationOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + if node.input[0] not in self.quantizer.quantized_value_map: + return False + return True + + def quantize(self): + """Do quantization.""" + node = self.node + if node.output[0] in [i.name for i in self.quantizer.model.model.graph.output]: + self.quantizer.dequantize_tensor(node, node.input[0]) + else: + self.quantizer.model.replace_input_of_all_nodes(node.output[0], node.input[0]) + self.quantizer.remove_nodes.append(node) + + +@base_op.op_registry( + op_types="Softmax, BiasGelu, Elu, Exp, FastGelu, Gelu, Softplus, Tanh", mode=[constants.STATIC_QUANT] +) +class Float16ActivationOperator(base_op.Operator): + """Float16 Activation operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(Float16ActivationOperator, self).__init__(onnx_quantizer, onnx_node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/argmax.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/argmax.py new file mode 100644 index 000000000..594e24c05 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/argmax.py @@ -0,0 +1,40 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ArgMax operator.""" + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="ArgMax", mode=[constants.STATIC_QUANT]) +class ArgMaxOperator(base_op.Operator): + """ArgMax operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(ArgMaxOperator, self).__init__(onnx_quantizer, onnx_node) + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + return True + + def convert(self): + """Convert to quantized format.""" + node = self.node + origin_name = node.input[0].split("_argmax_node")[0] + + if origin_name in self.quantizer.quantized_value_map: + node.name = node.name + "_quant" diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/attention.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/attention.py new file mode 100644 index 000000000..46f102352 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/attention.py @@ -0,0 +1,71 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Attention operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="Attention", mode=[constants.DYNAMIC_QUANT, constants.STATIC_QUANT]) +class AttentionOperator(base_op.Operator): + """Attention operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(AttentionOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, [0, 1]) + node.name = node.name + "_quant" + + def convert(self): + """Convert QDQ mode to QOperator format.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + quantized_name = [] + scale = [] + zp = [] + for parent in parents[:2]: + if parent.op_type == "DynamicQuantizeLinear": + quantized_name.append(parent.output[0]) + scale.append(parent.output[1]) + zp.append(parent.output[2]) + elif parent.op_type == "DequantizeLinear": + quantized_name.append(parent.input[0]) + scale.append(parent.input[1]) + zp.append(parent.input[2]) + self.quantizer.remove_nodes.append(parent) + + inputs = [] + inputs.extend(quantized_name) + inputs.append(node.input[2]) + inputs.extend(scale) + inputs.append(node.input[3] if len(node.input) > 3 else "") + inputs.extend(zp) + if len(node.input) > 4: + inputs.append(node.input[4]) + + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + kwargs["domain"] = quant_utils.ms_domain + qattention_node = onnx.helper.make_node("QAttention", inputs, node.output, node.name, **kwargs) + self.quantizer.new_nodes.append(qattention_node) + + self.quantizer.remove_nodes.append(node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py new file mode 100644 index 000000000..a5d3bc62d --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py @@ -0,0 +1,92 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
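For reference, the QAttention inputs assembled by AttentionOperator.convert() above end up packed in this order (derived from the loops shown; trailing optional slots may be empty strings):

    input_q, weight_q, bias,
    input_scale, weight_scale,
    mask_index (optional),
    input_zero_point, weight_zero_point,
    past (optional)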
+"""Base Operator.""" + +from onnx_neural_compressor import constants, quantization + +OPERATORS = { + "dynamic_quant": {}, + "static_quant": {}, +} + + +def op_registry(op_types, mode): + """The class decorator used to register all Operator subclasses.""" + + def decorator_op(cls): + assert cls.__name__.endswith( + "Operator" + ), "The name of subclass of Operator should end with 'Operator' substring." + for item in mode: + if cls.__name__[: -len("Operator")] in OPERATORS[item]: # pragma: no cover + raise ValueError("Cannot have two operators with the same name for {} mode.".format(item)) + break + for single_op_type in [op_type.strip() for op_type in op_types.split(",")]: + for item in mode: + OPERATORS[item][single_op_type] = cls + return cls + + return decorator_op + + +class Operator(object): + """Base Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + self.quantizer = onnx_quantizer + self.node = onnx_node + node_name = self.node.name.split("_quant")[0] + if node_name in self.quantizer.config: + self.dtype = self.quantizer.config[node_name] + self.disable_qdq_for_node_output = ( + True if onnx_node.op_type in onnx_quantizer.optypes_to_exclude_output_quant else False + ) + self.per_channel = False + self.calibrate_method = 0 # minmax + self.weight_sym = True + self.weight_dtype = None + self.activation_dtype = None + self.activation_sym = False + if node_name in self.quantizer.config: + if self.quantizer.config[node_name] not in self.quantizer.fallback_list: + self.per_channel = self.quantizer.config[node_name]["per_channel"] + self.calibrate_method = self.quantizer.config[node_name]["calibrate_method"] + self.weight_sym = self.quantizer.config[node_name]["weight_sym"] + self.weight_dtype = self.quantizer.config[node_name]["weight_type"] + self.activation_dtype = self.quantizer.config[node_name]["activation_type"] + self.activation_sym = self.quantizer.config[node_name]["activation_sym"] + + def quantize_check(self): + """Check if quantizaion can be done.""" + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node) + if not self.disable_qdq_for_node_output or self.quantizer.mode != constants.DYNAMIC_QUANT: + self.quantizer.quantize_outputs(node) + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + + if not node.name.endswith("_quant"): + return False + return True + + def convert(self): + """Convert to QOperator format.""" + return diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/binary_op.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/binary_op.py new file mode 100644 index 000000000..4aa1637b7 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/binary_op.py @@ -0,0 +1,150 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Binary operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="Add, Mul", mode=[constants.STATIC_QUANT]) +class BinaryOperator(base_op.Operator): + """Binary operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(BinaryOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + data_found, _, _, _, _ = self.quantizer._get_quantization_params(node.output[0]) + if not data_found: + return False + if self.quantizer.execution_provider == "TensorrtExecutionProvider": + return True + if not all([self.quantizer.is_valid_quantize_weight(i) for i in node.input]): + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, initializer_use_weight_qType=False) + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(node) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + children = self.quantizer.model.get_children(node) + if len(children) == 0 or not node.name.endswith("_quant"): + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + child = self.quantizer.model.get_children(node)[0] + + qlinear_binary_math_output = child.output[0] + + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + kwargs["domain"] = quant_utils.ms_domain + + qlinear_binary_math_inputs = [] + for parent in parents: + qlinear_binary_math_inputs.extend(parent.input) + qlinear_binary_math_inputs.extend(child.input[1:]) + + qlinear_binary_math_node = onnx.helper.make_node( + "QLinear" + node.op_type, qlinear_binary_math_inputs, [qlinear_binary_math_output], node.name, **kwargs + ) + + self.quantizer.new_nodes += [qlinear_binary_math_node] + self.quantizer.remove_nodes.extend(parents) + self.quantizer.remove_nodes.append(child) + self.quantizer.remove_nodes.append(node) + + +@base_op.op_registry(op_types="Mod", mode=[constants.STATIC_QUANT]) +class BinaryDirect8BitOperator(base_op.Operator): + """Binary operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(BinaryDirect8BitOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + data_found, _, _, _, _ = self.quantizer._get_quantization_params(node.output[0]) + if not data_found: + return False + if not all([self.quantizer.is_valid_quantize_weight(i) for i in node.input]): + return False + + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, initializer_use_weight_qType=False) + if not self.disable_qdq_for_node_output or self.quantizer.mode != "qdq": + self.quantizer.quantize_outputs(node) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + + children = self.quantizer.model.get_children(node) + if len(children) == 0 or not node.name.endswith("_quant"): + return False + return True + + def convert(self): + """Convert to QOperator 
format.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if any([i.op_type == "DequantizeLinear" for i in parents]) and any( + [i.op_type == "QuantizeLinear" for i in children] + ): + for idx, parent in enumerate(parents): + if parent.op_type == "DequantizeLinear": + self.node.input[idx] = parent.input[0] + self.quantizer.remove_nodes.append(parent) + for child in children: + if child.op_type == "QuantizeLinear": + self.quantizer.remove_nodes.append(child) + self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") + node.output[0] = node.output[0] + "_quantized" + + +@base_op.op_registry( + op_types="Sum, Sub, Div, Pow, Equal, Greater, GreaterOrEqual, Less, LessOrEqual", mode=[constants.STATIC_QUANT] +) +class Float16BinaryOperator(base_op.Operator): + """Float16 Binary operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(Float16BinaryOperator, self).__init__(onnx_quantizer, onnx_node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/concat.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/concat.py new file mode 100644 index 000000000..9e0f0ff6b --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/concat.py @@ -0,0 +1,125 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Concat Operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="Concat", mode=[constants.STATIC_QUANT]) +class ConcatOperator(base_op.Operator): + """Concat Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(ConcatOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + if len(node.input) == 1: # pragma: no cover + return False + inits = [i.name for i in self.quantizer.model.initializer()] + if all([inp not in self.quantizer.quantized_value_map and inp not in inits for inp in node.input]) or not all( + [inp in self.quantizer.quantized_value_map or inp in inits for inp in node.input] + ): + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + inits = [i.name for i in self.quantizer.model.initializer()] + for idx, inp in enumerate(node.input): + initializer_use_weight_qType = inp not in inits + self.quantizer.quantize_inputs(node, [idx], initializer_use_weight_qType) + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(node) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if len(children) == 0 or len(parents) == 0 or not node.name.endswith("_quant"): + return False + + # check input type + if all([i.op_type == "DequantizeLinear" for i in parents]) and any( + [i.op_type == "QuantizeLinear" for i in children] + ): + input_zp, input_scale, output_zp = [], [], [] + input_zp = [parent.input[2] for parent in parents] + input_scale = [parent.input[1] for parent in parents] + output_zp = [child.input[2] for child in children if child.op_type == "QuantizeLinear"] + + if ( + any([self.quantizer.model.get_initializer(zp) is None for zp in input_zp]) + or any([self.quantizer.model.get_initializer(zp) is None for zp in output_zp]) + or any([self.quantizer.model.get_initializer(scale) is None for scale in input_scale]) + ): # pragma: no cover + return False + + # check input scale is float type + if any( + [self.quantizer.model.get_initializer(scale).data_type != 1 for scale in input_scale] + ): # pragma: no cover + return False + # check input zp type is the same with output zp type + if any( + [ + self.quantizer.model.get_initializer(in_zp).data_type + not in [self.quantizer.model.get_initializer(out_zp).data_type for out_zp in output_zp] + for in_zp in input_zp + ] + ): + return False + + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + + if all([i.op_type == "DequantizeLinear" for i in parents]) and any( + [i.op_type == "QuantizeLinear" for i in children] + ): + inputs = [] + + inputs.extend([i for i in children if i.op_type == "QuantizeLinear"][0].input[1:]) + for parent in parents: + inputs.extend(parent.input) + self.quantizer.remove_nodes.append(parent) + for child in children: + if child.op_type == "QuantizeLinear": + self.quantizer.remove_nodes.append(child) + self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") 
+ + kwargs = {} + for attribute in node.attribute: + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + kwargs["domain"] = quant_utils.ms_domain + qlconcat_node = onnx.helper.make_node( + "QLinearConcat", inputs, [node.output[0] + "_quantized"], node.name, **kwargs + ) + + self.quantizer.new_nodes += [qlconcat_node] + self.quantizer.remove_nodes.append(node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/conv.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/conv.py new file mode 100644 index 000000000..ede7e1bfa --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/conv.py @@ -0,0 +1,201 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Conv Operator.""" + + +import onnx +from onnx import onnx_pb as onnx_proto + +from onnx_neural_compressor import constants +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="Conv, FusedConv", mode=[constants.DYNAMIC_QUANT]) +class ConvOperator(base_op.Operator): + """Conv Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(ConvOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize(self): + """Do quantizaion.""" + node = self.node + if node.op_type == "FusedConv": + kwargs = {} + for attribute in node.attribute: + if attribute.name == "activation" and attribute.s in [b"Relu", b"Clip"]: + continue + if attribute.name == "activation_params": + continue + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + conv = onnx.helper.make_node("Conv", node.input, node.output, node.name, **kwargs) + node.CopyFrom(conv) + + self.quantizer.quantize_inputs(node, [0]) + + if self.per_channel: + self.quantizer.quantize_weights_per_channel(node, [1], self.weight_dtype, self.weight_sym, 0) + else: + self.quantizer.quantize_inputs(node, [1]) + + if len(node.input) == 3: + self.quantizer.quantize_bias_tensor(node) + + node.name = node.name + "_quant" + + def convert(self): + """Convert to QOperator format.""" + node = self.node + inputs = [] + parents = self.quantizer.model.get_parents(node) + if parents[0].op_type == "QuantizeLinear": + inputs.append(parents[0].output[0]) + inputs.append(parents[1].input[0]) + inputs.append(parents[0].input[2]) + inputs.append(parents[1].input[2]) + scale_0 = parents[0].input[1] + else: + inputs.append(parents[0].output[0]) + inputs.append(parents[1].input[0]) + inputs.append(parents[0].output[2]) + inputs.append(parents[1].input[2]) + scale_0 = parents[0].output[1] + scale_1 = parents[1].input[1] + # quantize bias if exist + quantized_bias_name = "" + bias_present = False + if len(node.input) == 3: + quantized_bias_name = node.input[2] + "_quantized" + bias_present = True + + conv_integer_output = node.output[0] + "_output_quantized" + + kwargs = {} + for attribute in node.attribute: + if attribute.name == 
"activation" and attribute.s in [b"Relu", b"Clip"]: # pragma: no cover + continue + if attribute.name == "activation_params": # pragma: no cover + continue + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + conv_integer_node = onnx.helper.make_node("ConvInteger", inputs, [conv_integer_output], node.name, **kwargs) + self.quantizer.new_nodes.append(conv_integer_node) + + # Add bias add nodes + if bias_present: + conv_integer_output = self.quantizer.get_bias_add_nodes( + node, parents[1].input[0], conv_integer_output, quantized_bias_name + ) + + # Add cast operation to cast convInteger output to float. + cast_op_output = conv_integer_output + "_cast_output" + cast_node = onnx.helper.make_node( + "Cast", + [conv_integer_output], + [cast_op_output], + conv_integer_output + "_cast", + to=onnx_proto.TensorProto.FLOAT, + ) + self.quantizer.new_nodes.append(cast_node) + + # Add mul operation to multiply scales of two inputs. + scales_mul_op = node.name + "_scales_mul" + + scales_mul_node = quant_utils.find_by_name(scales_mul_op, self.quantizer.new_nodes) + if scales_mul_node is None: + scales_mul_node = onnx.helper.make_node("Mul", [scale_0, scale_1], [scales_mul_op + ":0"], scales_mul_op) + self.quantizer.new_nodes.append(scales_mul_node) + + scales_mul_op_output = scales_mul_node.output[0] + + # Add mul operation to multiply mul_scales_op result with output of ConvInteger + # and make the output of this node the same as output of original conv node. + output_scale_mul_op = node.name + "_output_scale_mul" + self.quantizer.new_nodes.append( + onnx.helper.make_node("Mul", [cast_op_output, scales_mul_op_output], [node.output[0]], output_scale_mul_op) + ) + self.quantizer.remove_nodes.extend(parents[1:]) + self.quantizer.remove_nodes.append(node) + + +@base_op.op_registry(op_types="Conv, FusedConv", mode=[constants.STATIC_QUANT]) +class StaticConvOperator(ConvOperator): + """Conv Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(ConvOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize(self): + """Do quantizaion.""" + node = self.node + if node.op_type == "FusedConv": + kwargs = {} + for attribute in node.attribute: + if attribute.name == "activation" and attribute.s in [b"Relu", b"Clip"]: + continue + if attribute.name == "activation_params": + continue + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + conv = onnx.helper.make_node("Conv", node.input, node.output, node.name, **kwargs) + node.CopyFrom(conv) + + self.quantizer.quantize_inputs(node, [0]) + + if self.per_channel: + self.quantizer.quantize_weights_per_channel(node, [1], self.weight_dtype, self.weight_sym, 0) + else: + self.quantizer.quantize_inputs(node, [1]) + + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(node) + + if len(node.input) == 3: + self.quantizer.quantize_bias_tensor(node) + + node.name = node.name + "_quant" + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + if len(self.quantizer.model.get_children(node)) == 0 or not node.name.endswith("_quant"): # pragma: no cover + return + parents = self.quantizer.model.get_parents(node) + child = self.quantizer.model.get_children(node)[0] + qlinear_conv_inputs = [] + for parent in parents[0:2]: + qlinear_conv_inputs.extend(parent.input) + qlinear_conv_inputs.extend(child.input[1:]) + if len(parents) == 3: + qlinear_conv_inputs.append(parents[-1].input[0]) + + qlinear_conv_output = child.output[0] + + kwargs = {} + for attribute in node.attribute: 
+ if attribute.name == "activation" and attribute.s in [b"Relu", b"Clip"]: # pragma: no cover + continue + if attribute.name == "activation_params": # pragma: no cover + continue + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + + qlinear_conv_node = onnx.helper.make_node( + "QLinearConv", qlinear_conv_inputs, [qlinear_conv_output], node.name, **kwargs + ) + self.quantizer.new_nodes.append(qlinear_conv_node) + self.quantizer.remove_nodes.extend(parents) + self.quantizer.remove_nodes.append(child) + self.quantizer.remove_nodes.append(node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/direct_q8.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/direct_q8.py new file mode 100644 index 000000000..77d09793b --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/direct_q8.py @@ -0,0 +1,78 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Direct8Bit Operator.""" + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry( + op_types="Reshape, Transpose, Squeeze, Unsqueeze, Flatten, Expand, Slice, " + "SpaceToDepth, DepthToSpace, Upsample, Tile, CenterCropPad", + mode=[constants.STATIC_QUANT], +) +class Direct8BitOperator(base_op.Operator): + """Direct8Bit Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(Direct8BitOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + if not self.quantizer.is_valid_quantize_weight(node.input[0]): + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(self.node, [0], direct_int8=True) + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(self.node, direct_int8=True) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if (len(children) == 0 and len(parents) == 0) or not node.name.endswith("_quant"): + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if any([i.op_type == "DequantizeLinear" for i in parents]) and any( + [i.op_type == "QuantizeLinear" for i in children] + ): + for parent in parents: + if parent.op_type == "DequantizeLinear": + # make sure parent DequantizeLinear of input 0 is not used by other ops + if len(self.quantizer.model.get_children(parent)) == 1 and not self.quantizer.model.is_graph_output( + parents[0].output[0] + ): + self.quantizer.remove_nodes.append(parent) + 
self.node.input[0] = parent.input[0] + break + for child in children: + if child.op_type == "QuantizeLinear": + self.quantizer.remove_nodes.append(child) + self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") + node.output[0] = node.output[0] + "_quantized" diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/embed_layernorm.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/embed_layernorm.py new file mode 100644 index 000000000..0b9967f3d --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/embed_layernorm.py @@ -0,0 +1,68 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""EmbedLayerNormalization Operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="EmbedLayerNormalization", mode=[constants.DYNAMIC_QUANT, constants.STATIC_QUANT]) +class EmbedLayerNormalizationOperator(base_op.Operator): + """EmbedLayerNormalization Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(EmbedLayerNormalizationOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, [2, 3, 4, 5, 6]) + node.name = node.name + "_quant" + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parents = [i for i in self.quantizer.model.get_parents(node) if i.op_type == "DequantizeLinear"] + inputs = [] + # 'input_ids' + inputs.extend([node.input[0]]) + # 'segment_ids' + inputs.extend([node.input[1]]) + for parent in parents: + inputs.append(parent.input[0]) + # 'mask' (optional) + if len(node.input) > 7: + inputs.append(node.input[7]) + + for parent in parents: + inputs.append(parent.input[1]) + for parent in parents: + inputs.append(parent.input[2]) + + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + kwargs["domain"] = quant_utils.ms_domain + + qembed_layer_norm_node = onnx.helper.make_node( + "QEmbedLayerNormalization", inputs, node.output, node.name, **kwargs + ) + self.quantizer.new_nodes.append(qembed_layer_norm_node) + self.quantizer.remove_nodes.extend(parents) + self.quantizer.remove_nodes.append(node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py new file mode 100644 index 000000000..e337125b2 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py @@ -0,0 +1,109 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Gather Operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry( + op_types="Gather, GatherElements, GatherND", mode=[constants.DYNAMIC_QUANT, constants.STATIC_QUANT] +) +class GatherOperator(base_op.Operator): + """Gather Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(GatherOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + if not self.quantizer.is_valid_quantize_weight(node.input[0]): + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, [0], initializer_use_weight_qType=False) + if not self.disable_qdq_for_node_output or self.quantizer.mode != constants.DYNAMIC_QUANT: + self.quantizer.quantize_outputs(node) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if len(children) == 0 or len(parents) == 0 or not node.name.endswith("_quant"): + return False + + return True + + def convert(self): + """Convert to QOperator format.""" + # DQ-Gather-Q-DQ-op + node = self.node + + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + + if any([i.op_type == "DequantizeLinear" for i in parents]): + + inputs = [] + inputs.append(parents[0].input[0]) + inputs.append(node.input[1]) + + out_scale = 1.0 + out_zp = 0 + gather_new_output = node.output[0] + "_quantized" # dynamic quant output name + for child in children: + if child.op_type == "QuantizeLinear": + out_scale = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(children[0].input[1])) + out_zp = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(children[0].input[2])) + gather_new_output = children[0].output[0] # static quant output name + self.quantizer.remove_nodes.append(child) + + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + + gather_node = onnx.helper.make_node(node.op_type, inputs, [gather_new_output], node.name, **kwargs) + self.quantizer.new_nodes.append(gather_node) + if any([i.op_type != "QuantizeLinear" for i in children]): + dq_inputs = [] + dq_inputs.append(gather_new_output) + dq_inputs.extend(parents[0].input[1:]) + dq_node = onnx.helper.make_node( + "DequantizeLinear", dq_inputs, [node.output[0]], node.name + "_DequantizeLinear" + ) + self.quantizer.new_nodes.append(dq_node) + + # int8 weight will be recalculated for the first time + if ( + any([child.op_type == "QuantizeLinear" for child in children]) + and self.quantizer.model.get_initializer(parents[0].input[0]) is not None + and parents[0].input[0] not in self.quantizer.recalculate_quantized_value + ): + 
int8_tensor = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[0])) + in_scale = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[1])) + in_zp = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[2])) + new_int8_tensor = (((int8_tensor.astype("float32") - in_zp) * in_scale) / out_scale).round() + out_zp + self.quantizer.model.set_initializer(parents[0].input[0], new_int8_tensor.astype(int8_tensor.dtype)) + self.quantizer.recalculate_quantized_value.append(parents[0].input[0]) + self.quantizer.remove_nodes.extend([node, parents[0]]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/gavgpool.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/gavgpool.py new file mode 100644 index 000000000..a91c1e531 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/gavgpool.py @@ -0,0 +1,59 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""GlobalAveragePool Operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="GlobalAveragePool", mode=[constants.STATIC_QUANT]) +class GlobalAveragePoolOperator(base_op.Operator): + """GlobalAveragePool Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(GlobalAveragePoolOperator, self).__init__(onnx_quantizer, onnx_node) + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + children = self.quantizer.model.get_children(node) + if len(children) == 0: # pragma: no cover + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parent = self.quantizer.model.get_parents(node)[0] + child = self.quantizer.model.get_children(node)[0] + + kwargs = {} + for attribute in node.attribute: + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + kwargs["domain"] = quant_utils.ms_domain + kwargs["channels_last"] = 0 + + inputs = parent.input + inputs.extend(child.input[1:]) + + qnode = onnx.helper.make_node("QLinear" + node.op_type, inputs, child.output, node.name + "_quant", **kwargs) + self.quantizer.new_nodes += [qnode] + self.quantizer.remove_nodes.append(child) + self.quantizer.remove_nodes.append(parent) + self.quantizer.remove_nodes.append(node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/gemm.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/gemm.py new file mode 100644 index 000000000..8d0b61c73 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/gemm.py @@ -0,0 +1,91 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with 
the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Gemm Operator.""" + +import onnx + +from onnx_neural_compressor import constants, logger +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="Gemm", mode=[constants.STATIC_QUANT]) +class GemmOperator(base_op.Operator): + """Gemm Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(GemmOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + if len(node.input) == 3 and not quant_utils.find_by_name(node.input[2], self.quantizer.model.initializer()): + + logger.warning( + "Bias of Gemm node '{}' is not constant. " + "Exclude this node can get better performance.".format(node.name) + ) + if self.quantizer.quant_format != "qdq": + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, [0]) + if self.per_channel and quant_utils.find_by_name(node.input[1], self.quantizer.model.initializer()): + self.quantizer.quantize_weights_per_channel( + node, [1], self.weight_dtype, self.weight_sym, 0 if quant_utils.is_B_transposed(node) else 1 + ) + else: + self.quantizer.quantize_inputs(node, [1]) + + if len(node.input) == 3 and quant_utils.find_by_name(node.input[2], self.quantizer.model.initializer()): + self.quantizer.quantize_bias_tensor(node) + beta_attribute = [attr for attr in node.attribute if attr.name == "beta"] + if len(beta_attribute): + beta_attribute[0].f = 1.0 + + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(node) + node.name = node.name + "_quant" + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parents = self.quantizer.model.get_parents(node) + qgemm_inputs = [] + for parent in parents[:-1]: + qgemm_inputs.extend(parent.input) + qgemm_inputs.append(parents[-1].input[0]) + + kwargs = {} + for attribute in node.attribute: + if attribute.name != "beta": + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + kwargs["domain"] = quant_utils.ms_domain + + qgemm_output = node.output[0] + if not self.disable_qdq_for_node_output: + child = self.quantizer.model.get_children(node)[0] + self.quantizer.remove_nodes.append(child) + qgemm_output = child.output[0] + qgemm_inputs.extend(child.input[1:]) + qgemm_node = onnx.helper.make_node("QGemm", qgemm_inputs, [qgemm_output], node.name, **kwargs) + + self.quantizer.new_nodes.append(qgemm_node) + self.quantizer.remove_nodes.extend(parents) + self.quantizer.remove_nodes.append(node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/lstm.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/lstm.py new file mode 100644 index 000000000..8499f2441 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/lstm.py @@ -0,0 +1,138 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this 
file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""LSTM Operator.""" + +import numpy +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="LSTM", mode=[constants.DYNAMIC_QUANT]) +class LSTMOperator(base_op.Operator): + """LSTM Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(LSTMOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize(self): + """Do quantizaion.""" + return + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + + if not self.quantizer.is_valid_quantize_weight(node.input[1]) or not self.quantizer.is_valid_quantize_weight( + node.input[2] + ): # pragma: no cover + return False + + model = self.quantizer.model + W = model.get_initializer(node.input[1]) + R = model.get_initializer(node.input[2]) + + if len(W.dims) != 3 or len(R.dims) != 3: # pragma: no cover + return False + + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + model = self.quantizer.model + W = model.get_initializer(self.node.input[1]) + R = model.get_initializer(self.node.input[2]) + + [W_num_dir, W_4_hidden_size, W_input_size] = W.dims + [R_num_dir, R_4_hidden_size, R_hidden_size] = R.dims + + if self.per_channel: # pragma: no cover + del W.dims[0] + del R.dims[0] + W.dims[0] = W_num_dir * W_4_hidden_size + R.dims[0] = R_num_dir * R_4_hidden_size + + quant_input_weight_tuple = self.quantizer.quantize_weight_per_channel( + node.input[1], self.weight_dtype, self.weight_sym, 0 + ) + quant_recurrent_weight_tuple = self.quantizer.quantize_weight_per_channel( + node.input[2], self.weight_dtype, self.weight_sym, 0 + ) + + W_quant_weight = model.get_initializer(quant_input_weight_tuple[0]) + R_quant_weight = model.get_initializer(quant_recurrent_weight_tuple[0]) + + W_quant_array = onnx.numpy_helper.to_array(W_quant_weight) + R_quant_array = onnx.numpy_helper.to_array(R_quant_weight) + + W_quant_array = numpy.reshape(W_quant_array, (W_num_dir, W_4_hidden_size, W_input_size)) + R_quant_array = numpy.reshape(R_quant_array, (R_num_dir, R_4_hidden_size, R_hidden_size)) + + W_quant_array = numpy.transpose(W_quant_array, (0, 2, 1)) + R_quant_array = numpy.transpose(R_quant_array, (0, 2, 1)) + + W_quant_tranposed = onnx.numpy_helper.from_array(W_quant_array, quant_input_weight_tuple[0]) + R_quant_tranposed = onnx.numpy_helper.from_array(R_quant_array, quant_recurrent_weight_tuple[0]) + + model.remove_initializers([W_quant_weight, R_quant_weight]) + model.add_initializer(W_quant_tranposed) + model.add_initializer(R_quant_tranposed) + + W_quant_zp = model.get_initializer(quant_input_weight_tuple[1]) + R_quant_zp = model.get_initializer(quant_recurrent_weight_tuple[1]) + W_quant_scale = model.get_initializer(quant_input_weight_tuple[2]) + R_quant_scale = model.get_initializer(quant_recurrent_weight_tuple[2]) + + if self.per_channel: # pragma: no cover + W_quant_zp.dims[:] = 
[W_num_dir, W_4_hidden_size] + R_quant_zp.dims[:] = [R_num_dir, R_4_hidden_size] + W_quant_scale.dims[:] = [W_num_dir, W_4_hidden_size] + R_quant_scale.dims[:] = [R_num_dir, R_4_hidden_size] + + inputs = [] + input_len = len(node.input) + inputs.extend([node.input[0]]) + inputs.extend([quant_input_weight_tuple[0], quant_recurrent_weight_tuple[0]]) + inputs.extend([node.input[3] if input_len > 3 else ""]) + inputs.extend([node.input[4] if input_len > 4 else ""]) + inputs.extend([node.input[5] if input_len > 5 else ""]) + inputs.extend([node.input[6] if input_len > 6 else ""]) + inputs.extend([node.input[7] if input_len > 7 else ""]) + inputs.extend( + [ + quant_input_weight_tuple[2], + quant_input_weight_tuple[1], + quant_recurrent_weight_tuple[2], + quant_recurrent_weight_tuple[1], + ] + ) + + kwargs = {} + for attribute in node.attribute: + if attribute.name == "layout": + continue + kwarg = quant_utils.attribute_to_kwarg(attribute) + kwargs.update(kwarg) + + quant_lstm_name = node.name + "_quant" + quant_lstm_node = onnx.helper.make_node( + "DynamicQuantizeLSTM", inputs, node.output, quant_lstm_name, domain="com.microsoft", **kwargs + ) + self.quantizer.remove_nodes.append(node) + self.quantizer.new_nodes.append(quant_lstm_node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/matmul.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/matmul.py new file mode 100644 index 000000000..eff98f533 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/matmul.py @@ -0,0 +1,168 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
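+# In outline, the dynamic variant below rewrites a float MatMul into
+#     MatMulInteger -> Cast(int32 to float) -> Mul(product of the two input scales)
+# so the integer accumulator is rescaled back to float, while the static variant
+# folds the surrounding Q/DQ pairs into a single QLinearMatMul (or
+# MatMulIntegerToFloat when output quantization is disabled).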
+"""MatMul Operator.""" + +import onnx +from onnx import onnx_pb as onnx_proto + +from onnx_neural_compressor import constants +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="MatMul", mode=[constants.DYNAMIC_QUANT]) +class MatMulOperator(base_op.Operator): + """MatMul Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(MatMulOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + if not all([self.quantizer.model.get_initializer(i) is None for i in node.input]): + return True + elif all([i not in self.quantizer.quantized_value_map for i in node.input]): + return False + else: + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, [0]) + if self.per_channel and quant_utils.find_by_name(node.input[1], self.quantizer.model.initializer()): + self.quantizer.quantize_weights_per_channel(node, [1], self.weight_dtype, self.weight_sym, 1) + else: + self.quantizer.quantize_inputs(node, [1]) + + node.name = node.name + "_quant" + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parents = self.quantizer.model.get_parents(node) + + inputs = [] + quantized_name = [] + scale = [] + zp = [] + for parent in parents: + if parent.op_type == "DequantizeLinear": + quantized_name.append(parent.input[0]) + else: + quantized_name.append(parent.output[0]) + if parent.op_type == "DynamicQuantizeLinear": + scale.append(parent.output[1]) + zp.append(parent.output[2]) + else: + scale.append(parent.input[1]) + zp.append(parent.input[2]) + inputs.extend(quantized_name) + inputs.extend(zp) + matmul_integer_output = node.output[0] + "_output_quantized" + matmul_integer_node = onnx.helper.make_node("MatMulInteger", inputs, [matmul_integer_output], node.name) + self.quantizer.new_nodes.append(matmul_integer_node) + + # Add cast operation to cast matmulInteger output to float. + cast_op_output = matmul_integer_output + "_cast_output" + cast_node = onnx.helper.make_node( + "Cast", + [matmul_integer_output], + [cast_op_output], + matmul_integer_output + "_cast", + to=onnx_proto.TensorProto.FLOAT, + ) + self.quantizer.new_nodes.append(cast_node) + + # Add mul operation to multiply scales of two inputs. + scales_mul_op = node.name + "_scales_mul" + + scales_mul_node = quant_utils.find_by_name(scales_mul_op, self.quantizer.new_nodes) + if scales_mul_node is None: + scales_mul_node = onnx.helper.make_node("Mul", [scale[0], scale[1]], [scales_mul_op + ":0"], scales_mul_op) + self.quantizer.new_nodes.append(scales_mul_node) + + scales_mul_op_output = scales_mul_node.output[0] + + # Add mul operation to multiply mul_scales_op result with output of MatMulInteger + # and make the output of this node the same as output of original matmul node. 
+ output_scale_mul_op = node.name + "_output_scale_mul" + self.quantizer.new_nodes.append( + onnx.helper.make_node("Mul", [cast_op_output, scales_mul_op_output], [node.output[0]], output_scale_mul_op) + ) + if parents[1].op_type == "DequantizeLinear": + self.quantizer.remove_nodes.append(parents[1]) + self.quantizer.remove_nodes.append(node) + + +@base_op.op_registry(op_types="MatMul", mode=[constants.STATIC_QUANT]) +class StaticMatMulOperator(MatMulOperator): + """MatMul Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(MatMulOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, [0]) + if self.per_channel and quant_utils.find_by_name(node.input[1], self.quantizer.model.initializer()): + self.quantizer.quantize_weights_per_channel(node, [1], self.weight_dtype, self.weight_sym, 1) + else: + self.quantizer.quantize_inputs(node, [1]) + + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(node) + node.name = node.name + "_quant" + + def convert(self): + """Convert to QOperator format.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + if len(self.quantizer.model.get_children(node)) == 0 or not node.name.endswith("_quant"): # pragma: no cover + return + + qlinear_matmul_inputs = [] + if self.disable_qdq_for_node_output: + for i in range(len(parents[0].input)): + qlinear_matmul_inputs.extend([parent.input[i] for parent in parents]) + qlinear_matmul_node = onnx.helper.make_node( + "MatMulIntegerToFloat", qlinear_matmul_inputs, node.output, node.name, domain="com.microsoft" + ) + else: + # after inserting QDQ, MatMul -> Q-DQ-MatMul-Q-DQ + for parent in parents: + qlinear_matmul_inputs.extend(parent.input) + + child = self.quantizer.model.get_children(node)[0] + qlinear_matmul_output = child.output[0] + qlinear_matmul_inputs.extend(child.input[1:]) + qlinear_matmul_node = onnx.helper.make_node( + "QLinearMatMul", qlinear_matmul_inputs, [qlinear_matmul_output], node.name + ) + self.quantizer.remove_nodes.append(child) + self.quantizer.new_nodes.append(qlinear_matmul_node) + self.quantizer.remove_nodes.append(node) + + # make sure parent DequantizeLinear of input 0 is not used by other ops + if len(self.quantizer.model.get_children(parents[0])) == 1 and not self.quantizer.model.is_graph_output( + parents[0].output[0] + ): + self.quantizer.remove_nodes.extend(parents) + else: + self.quantizer.remove_nodes.append(parents[1]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/maxpool.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/maxpool.py new file mode 100644 index 000000000..cd5119c13 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/maxpool.py @@ -0,0 +1,74 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
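+# MaxPool is quantized as a "direct int8" pass-through: its output reuses the
+# input's scale and zero point, and since 8-bit MaxPool requires opset 12 the
+# check below skips older opsets.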
+"""MaxPool Operator.""" + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="MaxPool", mode=[constants.STATIC_QUANT]) +class MaxPoolOperator(base_op.Operator): + """MaxPool Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(MaxPoolOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + # if opset version is less than 12, just no change + if self.quantizer.opset_version < 12: # pragma: no cover + return False + + if not self.quantizer.is_valid_quantize_weight(node.input[0]): # pragma: no cover + return False + + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(self.node, direct_int8=True) + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(self.node, direct_int8=True) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + children = self.quantizer.model.get_children(node) + if len(children) == 0 or not node.name.endswith("_quant"): # pragma: no cover + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + parent = self.quantizer.model.get_parents(node)[0] + children = self.quantizer.model.get_children(node) + if parent.op_type != "DequantizeLinear" or all( + [i.op_type != "QuantizeLinear" for i in children] + ): # pragma: no cover + return + node.input[0] = parent.input[0] + node.output[0] = node.output[0].replace("_QuantizeInput", "_quantized") + for child in children: + if child.op_type == "QuantizeLinear": + self.quantizer.remove_nodes.append(child) + for n in self.quantizer.model.get_children(child): + self.quantizer.model.replace_node_input(n, child.output[0], node.output[0]) + + self.quantizer.remove_nodes.append(parent) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py new file mode 100644 index 000000000..6ffe742b5 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py @@ -0,0 +1,102 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Pad Operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="Pad", mode=[constants.STATIC_QUANT]) +class PadOperator(base_op.Operator): + """Pad Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(PadOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + # if opset version is less than 11, just no change + if self.quantizer.opset_version < 11: # pragma: no cover + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, [0]) + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(node) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + children = self.quantizer.model.get_children(node) + if len(children) == 0 or not node.name.endswith("_quant"): # pragma: no cover + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parent = self.quantizer.model.get_parents(node)[0] + child = self.quantizer.model.get_children(node)[0] + + kwargs = {} + for attribute in node.attribute: + kv = quant_utils.attribute_to_kwarg(attribute) + kwargs.update(kv) + + if "mode" not in kwargs or kwargs["mode"] == b"constant": + if len(node.input) > 2: # There is 3rd input 'constant_value' + zp_tensor = self.quantizer.model.get_initializer(parent.input[2]) + scale_tensor = self.quantizer.model.get_initializer(parent.input[1]) + + padding_constant_initializer = self.quantizer.model.get_initializer(node.input[2]) + if padding_constant_initializer is not None: + zp_array = onnx.numpy_helper.to_array(zp_tensor) + zp_value = zp_array.item() if zp_array.ndim == 0 else zp_array[0] + scale_array = onnx.numpy_helper.to_array(scale_tensor) + scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0] + padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer) + quantized_padding_constant_array = quant_utils.quantize_nparray( + onnx.helper.tensor_dtype_to_np_dtype(self.weight_dtype), + padding_constant_array, + scale_value, + zp_value, + ) + quantized_padding_constant_name = node.input[2] + "_quantized" + quantized_padding_constant_initializer = onnx.numpy_helper.from_array( + quantized_padding_constant_array, quantized_padding_constant_name + ) + # Suppose this padding constant initializer only used by the node + self.quantizer.model.remove_initializer(padding_constant_initializer) + self.quantizer.model.add_initializer(quantized_padding_constant_initializer) + node.input[2] = quantized_padding_constant_name + else: + self.quantizer.quantize_inputs(node, [2], False) + node.input[2] = node.input[2] + "_DequantizeLinear" + else: + # pad zero_point for original zero + node.input.extend([parent.input[2]]) + + # Create an entry for output quantized value + node.input[0] = parent.input[0] + node.output[0] = child.output[0] + self.quantizer.remove_nodes.extend([parent, child]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/pooling.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/pooling.py new file mode 100644 index 000000000..fb97ce630 --- /dev/null +++ 
b/onnx_neural_compressor/algorithms/post_training_quant/operators/pooling.py @@ -0,0 +1,81 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""AveragePool Operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="AveragePool", mode=[constants.STATIC_QUANT]) +class PoolOperator(base_op.Operator): + """AveragePool Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(PoolOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + if not self.quantizer.is_valid_quantize_weight(node.input[0]): + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + super().quantize() + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + + if len(children) == 0 or len(parents) == 0 or not node.name.endswith("_quant"): + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + + if all([i.op_type == "DequantizeLinear" for i in parents]) and any( + [i.op_type == "QuantizeLinear" for i in children] + ): + qlinear_output_name = node.output[0] + "_quantized" + inputs = [] + inputs.extend(parents[0].input) + inputs.extend([i for i in children if i.op_type == "QuantizeLinear"][0].input[1:]) + kwargs = {} + for attribute in node.attribute: + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + kwargs["domain"] = quant_utils.ms_domain + qnode = onnx.helper.make_node("QLinear" + node.op_type, inputs, [qlinear_output_name], node.name, **kwargs) + + self.quantizer.remove_nodes.extend(parents) + for child in children: + if child.op_type == "QuantizeLinear": + self.quantizer.remove_nodes.append(child) + self.quantizer.model.replace_input_of_all_nodes(child.output[0], qnode.output[0]) + + self.quantizer.new_nodes.append(qnode) + self.quantizer.remove_nodes.append(node) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/reduce.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/reduce.py new file mode 100644 index 000000000..f89000e2e --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/reduce.py @@ -0,0 +1,83 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Reduce Operator.""" + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry( + op_types="ReduceMean, ReduceLogSum, ReduceLogSumExp, " "ReduceL1, ReduceL2, ReduceProd, ReduceSum, ReduceSumSquare", + mode=[constants.STATIC_QUANT], +) +class ReduceOperator(base_op.Operator): + """Reduce Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(ReduceOperator, self).__init__(onnx_quantizer, onnx_node) + + +@base_op.op_registry(op_types="ReduceMax, ReduceMin", mode=[constants.STATIC_QUANT]) +class ReduceMinMaxOperator(base_op.Operator): + """ReduceMin and ReduceMax Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(ReduceMinMaxOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + if not self.quantizer.is_valid_quantize_weight(node.input[0]): + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(self.node, [0], direct_int8=True) + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(self.node, direct_int8=True) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if (len(children) == 0 and len(parents) == 0) or not node.name.endswith("_quant"): + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if any([i.op_type == "DequantizeLinear" for i in parents]) and any( + [i.op_type == "QuantizeLinear" for i in children] + ): + for parent in parents: + if parent.op_type == "DequantizeLinear": + self.node.input[0] = parent.input[0] + self.quantizer.remove_nodes.append(parents[0]) + break + for child in children: + if child.op_type == "QuantizeLinear": + self.quantizer.remove_nodes.append(child) + self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") + node.output[0] = node.output[0] + "_quantized" diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/resize.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/resize.py new file mode 100644 index 000000000..0cba83441 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/resize.py @@ -0,0 +1,75 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Resize Operator.""" + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="Resize", mode=[constants.STATIC_QUANT]) +class ResizeOperator(base_op.Operator): + """Resize Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(ResizeOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + # if version is less than 11, just keep this node + if self.quantizer.opset_version < 11: + return False + if not self.quantizer.is_valid_quantize_weight(node.input[0]): + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, [0], direct_int8=True) + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(self.node, direct_int8=True) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if (len(children) == 0 and len(parents) == 0) or not node.name.endswith("_quant"): + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + + if any([i.op_type == "DequantizeLinear" for i in parents]) and any( + [i.op_type == "QuantizeLinear" for i in children] + ): + for parent in parents: + if parent.op_type == "DequantizeLinear" and parent.output[0] == node.input[0]: + self.node.input[0] = parent.input[0] + self.quantizer.remove_nodes.append(parent) + break + for child in children: + if child.op_type == "QuantizeLinear": + self.quantizer.remove_nodes.append(child) + self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") + node.output[0] = node.output[0] + "_quantized" diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/split.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/split.py new file mode 100644 index 000000000..3192b51d1 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/split.py @@ -0,0 +1,88 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
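+# Split only re-partitions data, so every output inherits the input's
+# quantization parameters; convert() then feeds the already-quantized tensor
+# straight into Split and drops the surrounding Q/DQ nodes.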
+"""Split Operator.""" + +import onnx + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="Split", mode=[constants.STATIC_QUANT]) +class SplitOperator(base_op.Operator): + """Split Operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(SplitOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + data_found, _, _, _, _ = self.quantizer._get_quantization_params(node.output[0]) + if not data_found: + return False + if not all([self.quantizer.is_valid_quantize_weight(i) for i in node.input]): + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(node, [0]) + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(self.node, direct_int8=True) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + parent = self.quantizer.model.get_parents(node)[0] + children = self.quantizer.model.get_children(node) + if ( + parent.op_type != "DequantizeLinear" or len(children) == 0 or not node.name.endswith("_quant") + ): # pragma: no cover + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parent = self.quantizer.model.get_parents(node)[0] + kwargs = {} + for attribute in node.attribute: # pragma: no cover + kwargs.update(quant_utils.attribute_to_kwarg(attribute)) + + quantized_input_names = [] + quantized_input_names.append(parent.input[0]) + if len(node.input) > 1: # pragma: no cover + quantized_input_names.extend(node.input[1:]) + outputs = [] + input_name_to_nodes = self.quantizer.model.input_name_to_nodes() + for output in node.output: + if output in input_name_to_nodes: + child = input_name_to_nodes[output][0] + if child.op_type == "QuantizeLinear": + self.quantizer.remove_nodes.append(child) + outputs.append(child.output[0]) + else: # pragma: no cover + outputs.append(output) + else: # pragma: no cover + outputs.append(output + "_quantized") + + quantized_node = onnx.helper.make_node(node.op_type, quantized_input_names, outputs, node.name, **kwargs) + self.quantizer.new_nodes.append(quantized_node) + self.quantizer.remove_nodes.extend([parent, node]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/unary_op.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/unary_op.py new file mode 100644 index 000000000..87c402b99 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/unary_op.py @@ -0,0 +1,80 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Unary operator.""" + +from onnx_neural_compressor import constants, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +@base_op.op_registry(op_types="Exp, Log, Round, Sqrt", mode=[constants.STATIC_QUANT]) +class UnaryOperator(base_op.Operator): + """Unary operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(UnaryOperator, self).__init__(onnx_quantizer, onnx_node) + + +@base_op.op_registry(op_types="Abs, Shrink, Sign", mode=[constants.STATIC_QUANT]) +class UnaryDirect8BitOperator(base_op.Operator): + """Unary operator.""" + + def __init__(self, onnx_quantizer, onnx_node): + """Initialization.""" + super(UnaryDirect8BitOperator, self).__init__(onnx_quantizer, onnx_node) + + def quantize_check(self): + """Check if quantizaion can be done.""" + node = self.node + if not self.quantizer.is_valid_quantize_weight(node.input[0]): + return False + return True + + def quantize(self): + """Do quantizaion.""" + node = self.node + self.quantizer.quantize_inputs(self.node, [0], direct_int8=True) + if not self.disable_qdq_for_node_output: + self.quantizer.quantize_outputs(self.node, direct_int8=True) + node.name = node.name + "_quant" + + def convert_check(self): + """Check if conversion can be done.""" + node = self.node + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if (len(children) == 0 and len(parents) == 0) or not node.name.endswith("_quant"): + return False + return True + + def convert(self): + """Convert to QOperator format.""" + node = self.node + + parents = self.quantizer.model.get_parents(node) + children = self.quantizer.model.get_children(node) + if any([i.op_type == "DequantizeLinear" for i in parents]) and any( + [i.op_type == "QuantizeLinear" for i in children] + ): + for parent in parents: + if parent.op_type == "DequantizeLinear": + self.node.input[0] = parent.input[0] + self.quantizer.remove_nodes.append(parents[0]) + break + for child in children: + if child.op_type == "QuantizeLinear": + self.quantizer.remove_nodes.append(child) + self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") + node.output[0] = node.output[0] + "_quantized" diff --git a/onnx_neural_compressor/algorithms/post_training_quant/quantizer.py b/onnx_neural_compressor/algorithms/post_training_quant/quantizer.py new file mode 100644 index 000000000..8fb49d2d0 --- /dev/null +++ b/onnx_neural_compressor/algorithms/post_training_quant/quantizer.py @@ -0,0 +1,1246 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Quantizer for onnx models.""" + +import copy +import logging +import os + +import numpy as np +import onnx +import onnxruntime as ort + +from onnx_neural_compressor import logger, onnx_model, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op + + +class Quantizer: + """Quantizer class.""" + + def __init__( + self, + model, + q_config, + mode, + static, + quantization_params, + op_types_to_quantize, + fallback_list=["fp32"], + reduce_range=None, + add_qdq_pair_to_weight=False, + optypes_to_exclude_output_quant=[], + dedicated_qdq_pair=False, + execution_provider="CPUExecutionProvider", + ): + """Initialization. + + Args: + model (ModelProto or onnx_model.ONNXModel): onnx model or onnx model wrapper by neural compressor + q_config (dict): op-wise quantization config. + mode (str): quantizaion mode + static (bool): static or not + quantization_params (dict): scale and zero point of tensors + op_types_to_quantize (list): optypes to quantize + fallback_list (list, optional): fallback data type. Defaults to ['fp32']. + reduce_range (bool, optional): use 7 bit or not. Defaults to None. + add_qdq_pair_to_weight (bool, optional): add QDQ pair to weight or not. Defaults to False. + optypes_to_exclude_output_quant (list, optional): optypes to exclude output quantization. Defaults to []. + dedicated_qdq_pair (bool, optional): dedicate QDQ pair or not. Defaults to False. + execution_provider (str, optional): execution_provider of onnxrt adaptor. Defaults to CPUExecutionProvider + """ + self.model = onnx_model.ONNXModel(model) if not isinstance(model, onnx_model.ONNXModel) else model + model = ( + onnx.shape_inference.infer_shapes(self.model.model) if not self.model.is_large_model else self.model.model + ) + self.config = q_config + self.execution_provider = execution_provider + self.reduce_range = reduce_range + self.mode = mode + self.quant_format = None + self.static = static # use static quantization for inputs. + self.fuse_dynamic_quant = False + self.quantization_params = quantization_params + self.op_types_to_quantize = op_types_to_quantize + self.fallback_list = fallback_list + self.new_nodes = [] + + self.opset_version = self.check_opset_version() + self.value_infos = {vi.name: vi for vi in model.graph.value_info} + self.value_infos.update({ot.name: ot for ot in model.graph.output}) + self.value_infos.update({it.name: it for it in model.graph.input}) + self.replace_input = [] + self.remove_nodes = [] + # List of quantized weights + self.quantized_value_map = {} + self.new_value_info = {} + + # List of recalculated quantize weight for Gather op. + self.recalculate_quantized_value = [] + + # QuantizeRange tensor name and zero tensor name for scale and zero point calculation. 
+ # Used when static is False + self.fixed_qrange_uint8_name = "fixed_quantization_range_uint8" + self.fixed_qrange_int8_name = "fixed_quantization_range_int8" + # For uint8 data-type, to compute zero point, we subtract rmin from 0 (represented by fixed_zero_name tensor) + self.fixed_zero_name = "fixed_zero" + # For int8 data-type, zero point is always zero (represented by fixed_zero_point_name tensor) + self.fixed_zero_zp_name = "fixed_zero_zp" + + if not self.static: + self.optypes_to_exclude_output_quant = op_types_to_quantize + else: + self.optypes_to_exclude_output_quant = optypes_to_exclude_output_quant + + self.add_qdq_pair_to_weight = add_qdq_pair_to_weight + self.dedicated_qdq_pair = dedicated_qdq_pair + + def check_opset_version(self): + """Check opset version.""" + ai_onnx_domain = [ + opset for opset in self.model.model.opset_import if not opset.domain or opset.domain == "ai.onnx" + ] + if 1 != len(ai_onnx_domain): + raise ValueError("Failed to find proper ai.onnx domain") + opset_version = ai_onnx_domain[0].version + + if opset_version > 10: + self.fuse_dynamic_quant = True + elif opset_version < 10: + logger.warning( + f"Warning: The original model opset version is {opset_version}, which does not support node " + + "fusions. Please update the model to opset >= 11 for better performance." + ) + self.model.model.opset_import.remove(ai_onnx_domain[0]) + self.model.model.opset_import.extend([onnx.helper.make_opsetid("", 11)]) + opset_version = 11 + + return opset_version + + def should_quantize(self, node): + """Check if node should be quantized.""" + if node.name in self.config and self.config[node.name] not in self.fallback_list: + return True + elif ( + quant_utils.get_node_original_name(node) in self.config + and self.config[quant_utils.get_node_original_name(node)] not in self.fallback_list + ): + return True + else: + return False + + def should_convert(self, node): + """Check if node should be converted.""" + name = quant_utils.get_node_original_name(node) + if name in self.config and self.config[name] not in self.fallback_list: + return True + else: + return False + + def _postprocess(self): + if "TensorrtExecutionProvider" in self.execution_provider: + utility.trt_env_setup(self.model.model) + self.merge_dedicated_qdq_pair() + self.model.remove_unused_nodes() + + self.model.model.producer_name = quant_utils.__producer__ + self.model.model.producer_version = quant_utils.__version__ + + def _preprocess(self): + quant_utils.remove_init_from_model_input(self.model) + quant_utils.split_shared_bias(self.model) + + def quantize_model(self): + """Quantize onnx model.""" + self._preprocess() + + # step 1: insert q-dq pairs + self.insert_qdq() + + self.remove_duplicate_qdq_paris() + + # step 2: convert q-node-dq to qoperator format if needed + if self.quant_format != "qdq": + self.convert_qdq_to_operator_oriented() + + self._postprocess() + quant_utils.dump_model_op_stats(self.model.model, self.config, self.op_types_to_quantize) + return self.model.model + + def merge_dedicated_qdq_pair(self): + """Merge dedicated Q/DQ pairs.""" + self.remove_nodes = [] + self.replace_input = [] + self.new_nodes = [] + if self.quant_format == "qdq" and self.dedicated_qdq_pair: + # node node + # | / \ + # q -> q q + # / \ / \ + # dq dq dq dq + for node in self.model.nodes(): + if node.op_type in ["QuantizeLinear"]: + children = self.model.get_children(node) + if len([i for i in children if i.op_type in ["DequantizeLinear"]]) < 2: + continue + for idx, child in enumerate(children): + if 
child.op_type not in ["DequantizeLinear"]: + continue + if self.should_quantize(self.model.get_children(child)[0]): + inputs = [self.model.get_parents(node)[0].output[0], node.input[1], node.input[2]] + self.new_nodes.append( + onnx.helper.make_node( + "QuantizeLinear", + inputs, + [node.output[0] + "_" + str(idx)], + node.name + "_" + str(idx), + ) + ) + self.replace_input.append([child, node.output[0], node.output[0] + "_" + str(idx)]) + else: + self.remove_nodes.append(child) + self.replace_input.append( + [self.model.get_children(child)[0], child.output[0], node.input[0]] + ) + self.remove_nodes.append(node) + self.model.remove_nodes(self.remove_nodes) + self.model.graph().node.extend(self.new_nodes) + for node, old_input_name, new_input_name in self.replace_input: + self.model.replace_node_input(node, old_input_name, new_input_name) + self.model.update() + + elif self.quant_format != "qdq" or not self.dedicated_qdq_pair: + # node node + # / \ -> | + # q(dq) q(dq) q(dq) + target_type = ["QuantizeLinear", "DequantizeLinear"] + for op_type in target_type: + for node in self.model.nodes(): + children = self.model.get_children(node) + dq_nodes = [i for i in children if i.op_type == op_type] + if len(dq_nodes) < 2 or node.op_type in ["Split"]: + continue + datas = [] + for n in dq_nodes: + datas.append( + [ + onnx.numpy_helper.to_array( + quant_utils.find_by_name(n.input[1], self.model.initializer()) + ), + onnx.numpy_helper.to_array( + quant_utils.find_by_name(n.input[2], self.model.initializer()) + ), + ] + ) + for idx, data in enumerate(datas): + repeaded_id = [i for i, item in enumerate(datas[idx:]) if item == data] + for i in repeaded_id[1:]: + self.remove_nodes.append(dq_nodes[i]) + self.replace_input.append( + [ + self.model.get_children(dq_nodes[i])[0], + dq_nodes[i].output[0], + dq_nodes[idx].output[0], + ] + ) + self.model.remove_nodes(self.remove_nodes) + self.model.graph().node.extend(self.new_nodes) + for node, old_input_name, new_input_name in self.replace_input: + self.model.replace_node_input(node, old_input_name, new_input_name) + self.model.update() + + def remove_duplicate_qdq_paris(self): + """Remove duplicated qdq pairs.""" + self.remove_nodes = [] + for node in self.model.nodes(): + if node.op_type == "DequantizeLinear": + matched_parents = self.model.match_parent_path( + node, + ["QuantizeLinear", "DequantizeLinear", "QuantizeLinear"], + [None, None, None], + ) + + if matched_parents is not None: + # (node) DQ - (matched_parents) Q-DQ-Q + if all( + [i.op_type == "QuantizeLinear" for i in self.model.get_children(matched_parents[1])] + ) and not self.model.is_graph_output(matched_parents[1].output[0]): + self.remove_nodes.append(matched_parents[1]) + if all([i.op_type == "DequantizeLinear" for i in self.model.get_children(matched_parents[0])]): + self.remove_nodes.append(matched_parents[0]) + self.replace_input.append([node, node.input[0], matched_parents[2].output[0]]) + + self.model.remove_nodes(self.remove_nodes) + for node, old_input_name, new_input_name in self.replace_input: + self.model.replace_node_input(node, old_input_name, new_input_name) + + def insert_qdq(self): + """Insert Q/DQ pairs.""" + for node in self.model.nodes(): + if self.should_quantize(node): + op_quantizer = base_op.OPERATORS[self.mode][node.op_type](self, node) + if op_quantizer.quantize_check(): + op_quantizer.quantize() + self.model.graph().node.extend(self.new_nodes) + self.model.remove_nodes(self.remove_nodes) + + for node, old_input_name, new_input_name in self.replace_input: + 
self.model.replace_node_input(node, old_input_name, new_input_name) + self.model.update() + + def convert_qdq_to_operator_oriented(self): + """Convert QDQ to QOperator format.""" + self.new_nodes = [] + self.remove_nodes = [] + self.replace_input = [] + for node in self.model.nodes(): + if node.op_type not in ["QuantizeLinear", "DequantizeLinear"] and self.should_convert(node): + op_converter = base_op.OPERATORS[self.mode][node.op_type](self, node) + if op_converter.convert_check(): + op_converter.convert() + self.model.graph().node.extend(self.new_nodes) + self.model.remove_nodes(self.remove_nodes) + for node, old_input_name, new_input_name in self.replace_input: + self.model.replace_node_input(node, old_input_name, new_input_name) + self.model.update() + + def quantize_bias_tensor(self, node): + """Quantize bias.""" + input_name, weight_name, bias_name = node.input + if ( + self.quantization_params is None + or input_name not in self.quantization_params + or input_name not in self.quantized_value_map + or ( + input_name in self.quantized_value_map + and quant_utils.find_by_name(self.quantized_value_map[input_name].scale_name, self.model.initializer()) + is None + ) + ): + self._dynamic_quantize_bias(input_name, weight_name + "_scale", bias_name, bias_name + "_quantized") + else: + beta = 1.0 + if node.op_type in ["Gemm"]: + beta_attribute = [attr for attr in node.attribute if attr.name == "beta"] + if len(beta_attribute): + beta = onnx.helper.get_attribute_value(beta_attribute[0]) + _, quant_value = self.quantize_bias(bias_name, input_name, weight_name, beta) + if self.model.get_initializer_share_num(bias_name) == 1: + self.model.remove_initializer(quant_utils.find_by_name(bias_name, self.model.initializer())) + inputs = [quant_value.q_name, quant_value.scale_name, quant_value.zp_name] + axis = None + if quant_utils.find_by_name(weight_name + "_DequantizeLinear", self.new_nodes): + dq_node = quant_utils.find_by_name(weight_name + "_DequantizeLinear", self.new_nodes) + if dq_node.op_type == "DequantizeLinear" and quant_utils.find_by_name("axis", dq_node.attribute): + axis = quant_utils.find_by_name("axis", dq_node.attribute).i + dequant_node = onnx.helper.make_node( + "DequantizeLinear", + inputs, + [bias_name + "_dequantized"], + bias_name + "_DequantizeLinear", + axis=axis, + ) + self.new_nodes.append(dequant_node) + self.replace_input.append( + [quant_utils.find_by_name(node.name, self.model.nodes()), bias_name, bias_name + "_dequantized"] + ) + + def quantize_bias(self, bias_name, input_name, weight_name, beta=1.0): + """Quantized the bias. 
+
+        Zero Point == 0 and Scale == Input_Scale * Weight_Scale
+        """
+        # get scale for weight
+        weight_scale_initializer = quant_utils.find_by_name(weight_name + "_scale", self.model.initializer())
+        weight_scale = (
+            self.tensor_proto_to_array(weight_scale_initializer, os.path.dirname(self.model.model_path))
+            if self.model.model_path is not None
+            else self.tensor_proto_to_array(weight_scale_initializer)
+        )
+
+        # get bias
+        bias_initializer = quant_utils.find_by_name(bias_name, self.model.initializer())
+        bias_data = (
+            self.tensor_proto_to_array(bias_initializer, os.path.dirname(self.model.model_path))
+            if self.model.model_path is not None
+            else self.tensor_proto_to_array(bias_initializer)
+        )
+        quantized_bias_name = bias_name + "_quantized"
+
+        if input_name in self.quantized_value_map:
+            input_scale_name = self.quantized_value_map[input_name].scale_name
+        elif input_name in self.quantization_params:
+            _, input_scale_name, _, _, _ = self._get_quantization_params(input_name)
+        else:
+            raise ValueError(f"Expected {input_name} to be in quantized value map for static quantization")
+        inputscale_initializer = quant_utils.find_by_name(input_scale_name, self.model.initializer())
+        input_scale = (
+            self.tensor_proto_to_array(inputscale_initializer, os.path.dirname(self.model.model_path))
+            if self.model.model_path is not None
+            else self.tensor_proto_to_array(inputscale_initializer)
+        )
+
+        # calculate scale for bias
+
+        bias_scale = input_scale * weight_scale * beta
+
+        # quantize bias
+        quantized_data = (np.asarray(bias_data) / bias_scale).round().astype(np.int32)
+
+        # update bias initializer
+        bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(bias_initializer.dims)
+        packed_bias_initializer = onnx.numpy_helper.from_array(bias_np_data, quantized_bias_name)
+        self.model.initializer().extend([packed_bias_initializer])
+
+        # update scale initializer
+        quantized_bias_scale_name = bias_name + "_scale"
+        bias_scale_data = np.asarray(bias_scale, dtype=np.float32).reshape(-1)
+        packed_bias_scale_initializer = onnx.numpy_helper.from_array(bias_scale_data, quantized_bias_scale_name)
+        self.model.initializer().extend([packed_bias_scale_initializer])
+
+        # update zero initializer
+        quantized_bias_zp_name = bias_name + "_zero_point"
+        bias_zp_data = np.zeros(bias_scale.shape, dtype=np.int32).reshape(-1)
+        packed_bias_zp_initializer = onnx.numpy_helper.from_array(bias_zp_data, quantized_bias_zp_name)
+        self.model.initializer().extend([packed_bias_zp_initializer])
+
+        quantized_value = quant_utils.QuantizedValue(
+            bias_name,
+            quantized_bias_name,
+            quantized_bias_scale_name,
+            quantized_bias_zp_name,
+            None,
+            onnx.TensorProto.INT32,
+        )
+        return quantized_bias_name, quantized_value
+
+    def quantize_weight_per_channel(self, weight_name, weight_qType, sym, channel_axis):
+        """Quantize weight per-channel."""
+        name = (
+            ("_").join([weight_name, str(weight_qType)])
+            if self.model.get_initializer_share_num(weight_name) > 1
+            else weight_name
+        )
+        if name in self.quantized_value_map:
+            return (name + "_quantized", name + "_zero_point", name + "_scale")
+
+        initializer = quant_utils.find_by_name(weight_name, self.model.initializer())
+        if initializer is None:
+            raise ValueError("{} is not an initializer".format(weight_name))
+
+        weights = (
+            self.tensor_proto_to_array(initializer, os.path.dirname(self.model.model_path))
+            if self.model.model_path is not None
+            else self.tensor_proto_to_array(initializer)
+        )
+        rmin, rmax, zero_point, scale, quantized_weights =
quant_utils.quantize_data_per_channel( + weights, + channel_axis, + weight_qType, + sym, + self.reduce_range, + ) + + weight = quant_utils.QuantizedInitializer( + name, + initializer, + rmin, + rmax, + zero_point, + scale, + weights, + quantized_weights.flatten().tolist(), + channel_axis, + weight_qType, + ) + + self._update_weight(weight) + quantized_value = quant_utils.QuantizedValue( + weight.name, + weight.name + "_quantized", + weight.name + "_scale", + weight.name + "_zero_point", + None, + weight_qType, + ) + self.quantized_value_map[weight.name] = quantized_value + + return (weight.name + "_quantized", weight.name + "_zero_point", weight.name + "_scale") + + def dequantize_tensor(self, node, value_name): + """Dequantize tensor.""" + if value_name in self.quantized_value_map: + quantized_value = self.quantized_value_map[value_name] + dqlinear_name = value_name + "_DequantizeLinear" + dqlinear_inputs = [value_name + "_quantized", quantized_value.scale_name, quantized_value.zp_name] + dequantize_node = onnx.helper.make_node("DequantizeLinear", dqlinear_inputs, [value_name], dqlinear_name) + if dequantize_node not in self.new_nodes: + self.new_nodes.append(dequantize_node) + else: # pragma: no cover + data_found, scale_name, zp_name, _, _ = self._get_quantization_params(value_name) + if self.static: + if data_found is False: + raise ValueError( + "Quantization parameters are not specified for param {}." + "In static mode quantization params for inputs and outputs " + "of nodes to be quantized are required.".format(value_name) + ) + dqlinear_name = value_name + "_DequantizeLinear" + dqlinear_inputs = [value_name + "_quantized", scale_name, zp_name] + dequantize_node = onnx.helper.make_node("DequantizeLinear", dqlinear_inputs, [value_name], dqlinear_name) + if dequantize_node not in self.new_nodes: + self.new_nodes.append(dequantize_node) + + def _update_weight(self, weight): + """Update weight. 
+ + Given a weight object, update the graph by doing the following: + - remove old initializer, update new initializers for + quantized weight, zero point, and scale + - remove old weight input, update with new inputs for + quantized weight, zero point, and scale + This function does NOT update the nodes in the graph, just initializers and inputs + """ + if weight.name in self.quantized_value_map: + return + packed_weight_name = weight.name + "_quantized" + scale_name = weight.name + "_scale" + zero_point_name = weight.name + "_zero_point" + + # Update packed weight, zero point, and scale initializers + packed_weight_np_data = np.asarray( + weight.quantized_data, dtype=onnx.helper.tensor_dtype_to_np_dtype(weight.qType) + ).reshape(weight.initializer.dims) + packed_weight_initializer = onnx.numpy_helper.from_array(packed_weight_np_data, packed_weight_name) + + if not self.add_qdq_pair_to_weight or self.quant_format != "qdq": + self.model.initializer().append(packed_weight_initializer) + if weight.axis is not None: + zero_scale_shape = [weight.initializer.dims[weight.axis]] + else: # scale and zero point must be scalar + zero_scale_shape = [] + zero_point_type = weight.qType + scale_initializer = onnx.helper.make_tensor( + scale_name, weight.initializer.data_type, zero_scale_shape, weight.scales + ) + zero_initializer = onnx.helper.make_tensor( + zero_point_name, zero_point_type, zero_scale_shape, weight.zero_points + ) + + self.model.initializer().extend([scale_initializer, zero_initializer]) + + @staticmethod + def tensor_proto_to_array(initializer, base_dir=""): + """Convert TensorProto to array.""" + if quant_utils.is_quantizable_type(initializer.data_type): + weights = onnx.numpy_helper.to_array(initializer, base_dir) + else: + raise ValueError( + "Only float type quantization is supported. \ + Weights {} is {}.".format( + initializer.name, + str(onnx.helper.tensor_dtype_to_np_dtype(initializer.data_type)), + ) + ) + return weights + + def _get_quantization_params(self, param_name): + """Create initializers and inputs in the graph for zero point and scale of output. + + Zero point and scale values are obtained from self.quantization_params if specified. + + Args: + param_name (string): Name of the quantization parameter. + """ + if self.quantization_params is None or param_name not in self.quantization_params: + return False, "", "", "", "" + + params = self.quantization_params[param_name] + if params is None or len(params) != 2: + raise ValueError( + "Quantization parameters should contain zero point and scale. 
" + "Specified values for output {}: {}".format(param_name, params) + ) + + zero_point_values = [params[0]] + zero_point_shape = [] + zero_point_name = param_name + "_zero_point" + zero_point_type = onnx.helper.np_dtype_to_tensor_dtype(params[0].dtype) + + scale_values = [params[1]] + scale_shape = [] + scale_name = param_name + "_scale" + scale_dtype = onnx.helper.np_dtype_to_tensor_dtype(params[1].dtype) + + # Add initializers + init_zp = onnx.helper.make_tensor(zero_point_name, zero_point_type, zero_point_shape, zero_point_values) + self.model.add_initializer(init_zp) + init_scale = onnx.helper.make_tensor(scale_name, scale_dtype, scale_shape, scale_values) + self.model.add_initializer(init_scale) + + return True, scale_name, zero_point_name, scale_shape, zero_point_shape + + def _get_quantized_weight(self, initializer, qType, sym): + """Get quantized weight.""" + name = ( + ("_").join([initializer.name, str(qType)]) + if self.model.get_initializer_share_num(initializer.name) > 1 + else initializer.name + ) + if name in self.quantized_value_map: + return self.quantized_value_map[name] + weights_data = ( + self.tensor_proto_to_array(initializer, os.path.dirname(self.model.model_path)) + if self.model.model_path is not None + else self.tensor_proto_to_array(initializer) + ) + rmin, rmax, zero_point, scale, quantized_weights_data = quant_utils.quantize_data( + weights_data.flatten().tolist(), + qType, + sym, + self.reduce_range, + ) + weight = quant_utils.QuantizedInitializer( + name, + initializer, + [rmin], + [rmax], + [zero_point], + [scale], + weights_data, + quantized_weights_data, + axis=None, + qType=qType, + ) + + return weight + + def is_valid_quantize_weight(self, weight_name): + """Check weight can be quantized.""" + weight = quant_utils.find_by_name(weight_name, self.model.initializer()) + if weight is not None: + return quant_utils.is_quantizable_type(weight.data_type) + else: + return weight_name in self.quantized_value_map + + def get_bias_add_nodes(self, node, weight_name, last_output, quantized_bias_name): + """Given a node, this function handles bias add by adding a "reshape" node on bias and an "add" node. 
+ + Args: + node (NodeProto): current node (Conv) + weight_name (string): weight name + last_output (_type_): output of previous node (input to bias add) + quantized_bias_name (string): bias name + """ + # Add tensors for the shape to be reshaped to + weight = quant_utils.find_by_name(weight_name, self.model.initializer()) + if weight is None: + raise ValueError("Expected {} to be an initializer".format(node.input[1])) + + # Add reshape for correct broadcast + reshape_input_data = quantized_bias_name + reshape_input_shape = quantized_bias_name + "_reshape_shape" + reshape_input = [reshape_input_data, reshape_input_shape] + reshape_shape = np.ones((len(weight.dims)), dtype=np.int64) + reshape_shape[1] = -1 + init_shape = onnx.helper.make_tensor( + reshape_input_shape, onnx.TensorProto.INT64, [len(weight.dims)], reshape_shape + ) + self.model.add_initializer(init_shape) + + reshape_op_output = node.output[0] + "_reshape" + reshape_node = onnx.helper.make_node( + "Reshape", reshape_input, [reshape_op_output], quantized_bias_name + "reshape" + ) + self.new_nodes.append(reshape_node) + + # Add an Add operation for bias + bias_add_input = [last_output] + bias_add_input.append(reshape_op_output) + add_node_output = node.output[0] + "_bias_add" + add_node = onnx.helper.make_node("Add", bias_add_input, [add_node_output], quantized_bias_name + "bias_add") + self.new_nodes.append(add_node) + return add_node_output + + def quantize_outputs(self, node, initializer_use_weight_qType=True, direct_int8=False): + """Quantize node outputs.""" + for idx, tensor_name in enumerate(node.output): + if ( + tensor_name in self.value_infos + and self.value_infos[tensor_name].type.HasField("tensor_type") + and not quant_utils.is_quantizable_type(self.value_infos[tensor_name].type.tensor_type.elem_type) + ): + return + data_found = False + refer_name = node.input[0] if direct_int8 else tensor_name + + if refer_name in self.quantized_value_map: + scale_name = self.quantized_value_map[refer_name].scale_name + zp_name = self.quantized_value_map[refer_name].zp_name + data_found = True + elif refer_name in self.quantization_params: + data_found, scale_name, zp_name, _, _ = self._get_quantization_params(refer_name) + + if data_found is False: + raise ValueError( + "Quantization parameters are not specified for param {}." 
+ "In static mode quantization params for inputs and outputs " + "of nodes to be quantized are required.".format(tensor_name) + ) + + node.output[idx] = tensor_name + "_QuantizeInput" + q_input = node.output[idx] + q_output = tensor_name + "_quantized" + dq_input = q_output + dq_output = tensor_name + quant_node_name = tensor_name + "_" + node.name + "_QuantizeLinear" + dequant_node_name = tensor_name + "_" + node.name + "_DequantizeLinear" + qlinear_node = onnx.helper.make_node( + "QuantizeLinear", + [q_input, scale_name, zp_name], + [q_output], + quant_node_name, + ) + dequant_node = onnx.helper.make_node( + "DequantizeLinear", + [dq_input, scale_name, zp_name], + [dq_output], + dequant_node_name, + ) + self.new_nodes.extend([qlinear_node, dequant_node]) + for child in self.model.get_children(node): + self.replace_input.append([child, tensor_name, dequant_node.output[0]]) + if tensor_name not in self.quantized_value_map: + quantized_value = quant_utils.QuantizedValue(tensor_name, dq_output, scale_name, zp_name) + self.quantized_value_map[tensor_name] = quantized_value + + def quantize_inputs(self, node, indices=None, initializer_use_weight_qType=True, direct_int8=False): + """Quantize node inputs.""" + # Quantize the input + for idx, tensor_name in enumerate(node.input): + if indices and idx not in indices: + continue + initializer = quant_utils.find_by_name(tensor_name, self.model.initializer()) + if initializer is not None: + if not quant_utils.is_quantizable_type(initializer.data_type): + return + + dtype = ( + self.config[node.name]["weight_type"] + if initializer_use_weight_qType + else self.config[node.name]["activation_type"] + ) + sym = ( + self.config[node.name]["weight_sym"] + if initializer_use_weight_qType + else self.config[node.name]["activation_sym"] + ) + weight = self._get_quantized_weight(initializer, dtype, sym) + self._update_weight(weight) + node.input[idx] = weight.name + q_weight_name = weight.name + "_quantized" + zp_name = weight.name + "_zero_point" + scale_name = weight.name + "_scale" + + if self.add_qdq_pair_to_weight and self.quant_format == "qdq": + qlinear_node = onnx.helper.make_node( + "QuantizeLinear", + [tensor_name, scale_name, zp_name], + [weight.name + "_quantized"], + weight.name + "_QuantizeLinear", + ) + self.new_nodes.append(qlinear_node) + + dequant_node = onnx.helper.make_node( + "DequantizeLinear", + [q_weight_name, scale_name, zp_name], + [weight.name + "_dequantized"], + weight.name + "_DequantizeLinear", + ) + self.new_nodes.append(dequant_node) + self.replace_input.append([node, weight.name, dequant_node.output[0]]) + if weight.name not in self.quantized_value_map: + quantized_value = quant_utils.QuantizedValue( + weight.name, + q_weight_name, + scale_name, + zp_name, + None, + dtype, + ) + self.quantized_value_map[weight.name] = quantized_value + else: + if ( + tensor_name in self.value_infos + and self.value_infos[tensor_name].type.HasField("tensor_type") + and not quant_utils.is_quantizable_type(self.value_infos[tensor_name].type.tensor_type.elem_type) + ): + return + self._quantize_activation(node, tensor_name, direct_int8) + + def quantize_weights_per_channel(self, node, indices, weight_qType, sym, axis): + """Quantize weights per-channel.""" + if self.opset_version < 13 and self.quant_format == "qdq": + self.quantize_inputs(node, indices) + return + + for idx, inp in enumerate(node.input): + if idx not in indices: + continue + + q_name, zp_name, scale_name = self.quantize_weight_per_channel(inp, weight_qType, sym, axis) + 
weight_name = ("_").join([inp, str(weight_qType)]) if self.model.get_initializer_share_num(inp) > 1 else inp + dequant_node = onnx.helper.make_node( + "DequantizeLinear", + [q_name, scale_name, zp_name], + [weight_name + "_dequantized"], + weight_name + "_DequantizeLinear", + axis=axis, + ) + self.new_nodes.append(dequant_node) + node.input[idx] = weight_name + + # Replace weight_name with output of DequantizeLinear + self.replace_input.append([node, weight_name, dequant_node.output[0]]) + + if self.add_qdq_pair_to_weight and self.quant_format == "qdq": + qlinear_node = onnx.helper.make_node( + "QuantizeLinear", + [inp, scale_name, zp_name], + [q_name], + weight_name + "_QuantizeLinear", + axis=axis, + ) + self.new_nodes.append(qlinear_node) + + +class StaticQuantizer(Quantizer): + """Static quantizer class.""" + + def __init__( + self, + model, + q_config, + quant_format="qoperator", + quantization_params={}, + op_types_to_quantize=[], + fallback_list=["fp32"], + reduce_range=None, + add_qdq_pair_to_weight=False, + optypes_to_exclude_output_quant=[], + dedicated_qdq_pair=False, + execution_provider="CPUExecutionProvider", + ): + """Initialization. + + Args: + model (ModelProto or ONNXModel): onnx model or onnx model wrapper by neural compressor + q_config (dict): op-wise quantization config. + static (bool): static or not + quantization_params (dict): scale and zero point of tensors + op_types_to_quantize (list): optypes to quantize + fallback_list (list, optional): fallback data type. Defaults to ['fp32']. + reduce_range (bool, optional): use 7 bit or not. Defaults to None. + add_qdq_pair_to_weight (bool, optional): add QDQ pair to weight or not. Defaults to False. + optypes_to_exclude_output_quant (list, optional): optypes to exclude output quantization. Defaults to []. + dedicated_qdq_pair (bool, optional): dedicate QDQ pair or not. Defaults to False. + execution_provider (str, optional): execution_provider of onnxrt adaptor. Defaults to CPUExecutionProvider + """ + super().__init__( + mode="static_quant", + model=model, + q_config=q_config, + static=True, + quantization_params=quantization_params, + op_types_to_quantize=op_types_to_quantize, + ) + self.fallback_list = fallback_list + self.reduce_range = reduce_range + self.add_qdq_pair_to_weight = add_qdq_pair_to_weight + self.optypes_to_exclude_output_quant = optypes_to_exclude_output_quant + self.dedicated_qdq_pair = dedicated_qdq_pair + self.execution_provider = execution_provider + self.static = True # use static quantization for inputs. 
+ self.quant_format = quant_format + if self.opset_version < 13 and self.quant_format == "qdq": + logger.warning( + "Per-channel support with QDQ format requires opset version >= 13," + " use per-tensor granularity instead" + ) + if "TensorrtExecutionProvider" in execution_provider: + + # TensorrtExecutionProvider doesn't support Conv + Add fusion + self._revert_conv_add_fusion() + + # only quantize Add which is followed by ReduceMean + for node in self.model.nodes(): + if node.op_type == "Add": + children = self.model.get_children(node) + if "ReduceMean" not in [i.op_type for i in children]: + self.config[node.name] = "fp32" + + def _revert_conv_add_fusion(self): + add_nodes = [] + remove_nodes = [] + for node in self.model.nodes(): + if node.op_type == "Conv" and len(node.input) == 3: + bias_tensor = self.model.get_initializer(node.input[2]) + bias_array = onnx.numpy_helper.to_array(bias_tensor).reshape((-1, 1, 1)) + self.model.remove_initializer(bias_tensor) + self.model.add_initializer(onnx.numpy_helper.from_array(bias_array, bias_tensor.name)) + kwargs = {} + activation_params = None + for attr in node.attribute: + kwargs.update(quant_utils.attribute_to_kwarg(attr)) + conv = onnx.helper.make_node("Conv", node.input[0:2], [node.name + "_revert"], node.name, **kwargs) + add = onnx.helper.make_node("Add", [conv.output[0], node.input[2]], node.output, node.name + "_add") + add_nodes.extend([conv, add]) + + self.model.remove_nodes(remove_nodes) + self.model.add_nodes(add_nodes) + self.model.update() + + def _quantize_activation(self, node, tensor_name, direct_int8=False): + """Quantize node activation.""" + if tensor_name in self.quantized_value_map: + scale_name = self.quantized_value_map[tensor_name].scale_name + zp_name = self.quantized_value_map[tensor_name].zp_name + data_found = True + else: + data_found, scale_name, zp_name, _, _ = self._get_quantization_params(tensor_name) + + if data_found is False: + raise ValueError( + "Quantization parameters are not specified for param {}." 
+ "In static mode quantization params for inputs and outputs " + "of nodes to be quantized are required.".format(tensor_name) + ) + + if direct_int8: + # direct int8 models will be quantized only if their inputs are quantized + if node.input[0] not in self.quantized_value_map: + return + + q_input = tensor_name + q_output = ( + tensor_name + "_" + node.name + "_QuantizeLinear" + if tensor_name not in self.model.input() + else tensor_name + "_quantized" + ) + dq_input = q_output + dq_output = ( + tensor_name + "_" + node.name + "_dequantized" + if tensor_name not in self.model.input() + else tensor_name + "_dequantized" + ) + self.replace_input.append([node, tensor_name, dq_output]) + + if tensor_name in self.model.input() and tensor_name in self.quantized_value_map: + return + + quant_node_name = tensor_name + "_" + node.name + "_QuantizeLinear" + dequant_node_name = tensor_name + "_" + node.name + "_DequantizeLinear" + qlinear_node = onnx.helper.make_node( + "QuantizeLinear", + [q_input, scale_name, zp_name], + [q_output], + quant_node_name, + ) + dequant_node = onnx.helper.make_node( + "DequantizeLinear", + [dq_input, scale_name, zp_name], + [dq_output], + dequant_node_name, + ) + self.new_nodes.extend([qlinear_node, dequant_node]) + + if tensor_name not in self.quantized_value_map: + quantized_value = quant_utils.QuantizedValue( + tensor_name, + dq_output, + scale_name, + zp_name, + ) + self.quantized_value_map[tensor_name] = quantized_value + + +class DynamicQuantizer(Quantizer): + """Dynamic quantizer class.""" + + def __init__( + self, + model, + q_config, + quantization_params={}, + op_types_to_quantize=[], + fallback_list=["fp32"], + reduce_range=None, + execution_provider="CPUExecutionProvider", + ): + """Initialization. + + Args: + model (ModelProto or onnx_model.ONNXModel): onnx model or onnx model wrapper by neural compressor + q_config (dict): op-wise quantization config. + quantization_params (dict): scale and zero point of tensors + op_types_to_quantize (list): optypes to quantize + fallback_list (list, optional): fallback data type. Defaults to ['fp32']. + reduce_range (bool, optional): use 7 bit or not. Defaults to None. + add_qdq_pair_to_weight (bool, optional): add QDQ pair to weight or not. Defaults to False. + dedicated_qdq_pair (bool, optional): dedicate QDQ pair or not. Defaults to False. + execution_provider (str, optional): execution_provider of onnxrt adaptor. 
Defaults to CPUExecutionProvider + """ + super().__init__( + mode="dynamic_quant", + model=model, + q_config=q_config, + static=False, + quantization_params=quantization_params, + op_types_to_quantize=op_types_to_quantize, + ) + + def _quantize_activation(self, node, tensor_name, direct_int8=False): + """Quantize node activation.""" + qlinear_node = None + if quant_utils.find_by_name(tensor_name + "_QuantizeLinear", self.model.nodes()) is not None: + qlinear_node = quant_utils.find_by_name(tensor_name + "_QuantizeLinear", self.model.nodes()) + elif quant_utils.find_by_name(tensor_name + "_QuantizeLinear", self.new_nodes) is not None: + qlinear_node = quant_utils.find_by_name(tensor_name + "_QuantizeLinear", self.new_nodes) + if qlinear_node is None: + if ( + self.fuse_dynamic_quant + and self.config[node.name]["activation_type"] == onnx.TensorProto.UINT8 + and not self.config[node.name]["activation_sym"] + ): + # DynamicQuantizeLinear supports uint8 input for CPU EP, supports uint8 and int8 for DML EP + scale_name = tensor_name + "_scale" + zp_name = tensor_name + "_zero_point" + if quant_utils.find_by_name(scale_name, self.model.initializer()): + self.model.remove_initializer(quant_utils.find_by_name(scale_name, self.model.initializer())) + if quant_utils.find_by_name(zp_name, self.model.initializer()): + self.model.remove_initializer(quant_utils.find_by_name(zp_name, self.model.initializer())) + qlinear_node = onnx.helper.make_node( + "DynamicQuantizeLinear", + [tensor_name], + [tensor_name + "_dynamic_quantized", scale_name, zp_name], + tensor_name + "_QuantizeLinear", + ) + else: + scale_name, zp_name, _, _ = self._get_dynamic_input_quantization_params( + tensor_name, self.config[node.name]["activation_type"] + ) + qlinear_node = onnx.helper.make_node( + "QuantizeLinear", + [tensor_name, scale_name, zp_name], + [tensor_name + "_quantized"], + tensor_name + "_QuantizeLinear", + ) + if qlinear_node not in self.new_nodes: + self.new_nodes.append(qlinear_node) + self.quantized_value_map[tensor_name] = quant_utils.QuantizedValue( + tensor_name, + qlinear_node.output[0], + scale_name, + zp_name, + self.config[node.name]["activation_type"], + ) + self.replace_input.append([node, tensor_name, qlinear_node.output[0]]) + + def _get_dynamic_input_quantization_params(self, input_name, qType): + """Create nodes for dynamic quantization of input. + + Args: + input_name (string): Name of the input. + qType (int): type to quantize to. + """ + if qType == onnx.TensorProto.INT8: + return self._get_dynamic_input_quantization_params_int8(input_name) + + return self._get_dynamic_input_quantization_params_uint8(input_name) + + def _get_dynamic_input_quantization_params_int8(self, input_name): # pragma: no cover + """Create nodes for dynamic quantization of input to int8. + + Args: + input_name (string): Name of the input. 
+ """ + qType = onnx.TensorProto.INT8 + + # Reduce min and Reduce max + input_scale_name = input_name + "_scale" + + reduce_min_name = input_name + "_ReduceMin" + reduce_min_node = onnx.helper.make_node( + "ReduceMin", + [input_name], + [reduce_min_name + ":0"], + reduce_min_name, + keepdims=0, + ) + self.new_nodes.append(reduce_min_node) + + reduce_max_name = input_name + "_ReduceMax" + reduce_max_node = onnx.helper.make_node( + "ReduceMax", + [input_name], + [reduce_max_name + ":0"], + reduce_max_name, + keepdims=0, + ) + self.new_nodes.append(reduce_max_node) + + # Compute scale + # Find abs(rmin) + reduce_min_abs_name = reduce_min_name + "_Abs" + reduce_min_abs_node = onnx.helper.make_node( + "Abs", + [reduce_min_node.output[0]], + [reduce_min_abs_name + ":0"], + reduce_min_abs_name, + ) + self.new_nodes.append(reduce_min_abs_node) + # Find abs(rmax) + reduce_max_abs_name = reduce_max_name + "_Abs" + reduce_max_abs_node = onnx.helper.make_node( + "Abs", + [reduce_max_node.output[0]], + [reduce_max_abs_name + ":0"], + reduce_max_abs_name, + ) + self.new_nodes.append(reduce_max_abs_node) + # Compute max of abs(rmin) and abs(rmax) + abs_max_name = input_name + "_Abs_Max" + abs_max_node = onnx.helper.make_node( + "Max", + [reduce_min_abs_node.output[0], reduce_max_abs_node.output[0]], + [abs_max_name + ":0"], + abs_max_name, + ) + self.new_nodes.append(abs_max_node) + # and divide by (quantize_range/2.0) which will be equal to max(...)*2.0/quantize_range + qmin, qmax = quant_utils.get_qmin_qmax_for_qType(qType, self.reduce_range) + initializer_div = onnx.helper.make_tensor( + self.fixed_qrange_int8_name, + onnx.TensorProto.FLOAT, + [], + [(qmax - qmin) / 2.0], + ) + self.model.add_initializer(initializer_div) + scale_div_name = input_name + "scale_Div" + scale_div_node = onnx.helper.make_node( + "Div", + [abs_max_node.output[0], self.fixed_qrange_int8_name], + [input_scale_name], + scale_div_name, + ) + self.new_nodes.append(scale_div_node) + + # Zero point + initializer_zp = onnx.helper.make_tensor(self.fixed_zero_zp_name, qType, [], [0]) + self.model.add_initializer(initializer_zp) + + return input_scale_name, self.fixed_zero_zp_name, [], [] + + def _get_dynamic_input_quantization_params_uint8(self, input_name): + """Create nodes for dynamic quantization of input to uint8. + + Args: + input_name (string): Name of the input. + """ + qType = onnx.TensorProto.UINT8 + # Reduce min and Reduce max + input_scale_name = input_name + "_scale" + input_zp_name = input_name + "_zero_point" + + reduce_min_name = input_name + "_ReduceMin" + reduce_min_node = onnx.helper.make_node( + "ReduceMin", + [input_name], + [reduce_min_name + ":0"], + reduce_min_name, + keepdims=0, + ) + self.new_nodes.append(reduce_min_node) + + reduce_max_name = input_name + "_ReduceMax" + reduce_max_node = onnx.helper.make_node( + "ReduceMax", + [input_name], + [reduce_max_name + ":0"], + reduce_max_name, + keepdims=0, + ) + self.new_nodes.append(reduce_max_node) + + # Add tensors for quantize range and zero value. 
+ qmin, qmax = quant_utils.get_qmin_qmax_for_qType(qType, self.reduce_range) + initializer_qrange = onnx.helper.make_tensor( + self.fixed_qrange_uint8_name, + onnx.TensorProto.FLOAT, + [], + [qmax - qmin], + ) + self.model.add_initializer(initializer_qrange) + initializer_qvalue = onnx.helper.make_tensor(self.fixed_zero_name, onnx.TensorProto.FLOAT, [], [0.0]) + self.model.add_initializer(initializer_qvalue) + + # Compute Scale + # Subtract rmax and rmin + scale_sub_name = input_name + "_scale_Sub" + scale_sub_node = onnx.helper.make_node( + "Sub", + [reduce_max_node.output[0], reduce_min_node.output[0]], + [scale_sub_name + ":0"], + scale_sub_name, + ) + self.new_nodes.append(scale_sub_node) + # and divide by quantize range + scale_div_name = input_name + "_scale_Div" + scale_div_node = onnx.helper.make_node( + "Div", + [scale_sub_node.output[0], self.fixed_qrange_uint8_name], + [input_scale_name], + scale_div_name, + ) + self.new_nodes.append(scale_div_node) + + # Compute zero point + # Subtract zero and rmin + zp_sub_name = input_name + "_zero_point_Sub" + zp_sub_node = onnx.helper.make_node( + "Sub", + [self.fixed_zero_name, reduce_min_node.output[0]], + [zp_sub_name + ":0"], + zp_sub_name, + ) + self.new_nodes.append(zp_sub_node) + # Divide by scale + zp_div_name = input_name + "_zero_point_Div" + zp_div_node = onnx.helper.make_node( + "Div", + [zp_sub_node.output[0], input_scale_name], + [zp_div_name + ":0"], + zp_div_name, + ) + self.new_nodes.append(zp_div_node) + # Compute floor + zp_floor_name = input_name + "_zero_point_Floor" + zp_floor_node = onnx.helper.make_node("Floor", zp_div_node.output, [zp_floor_name + ":0"], zp_floor_name) + self.new_nodes.append(zp_floor_node) + # Cast to integer + zp_cast_name = input_name + "_zero_point_Cast" + zp_cast_node = onnx.helper.make_node("Cast", zp_floor_node.output, [input_zp_name], zp_cast_name, to=qType) + self.new_nodes.append(zp_cast_node) + + return input_scale_name, input_zp_name, [], [] diff --git a/onnx_neural_compressor/algorithms/smoother/core.py b/onnx_neural_compressor/algorithms/smoother/core.py index d21641482..bcf830f1a 100644 --- a/onnx_neural_compressor/algorithms/smoother/core.py +++ b/onnx_neural_compressor/algorithms/smoother/core.py @@ -22,22 +22,12 @@ import onnxruntime as ort from onnx_neural_compressor import data_reader, logger, onnx_model, utility +from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor.algorithms.smoother import calibrator from typing import List, Union # isort: skip -_dtype_map = { - np.dtype("float32"): 1, - np.dtype("uint8"): 2, - np.dtype("int8"): 3, - np.dtype("int32"): 6, - np.dtype("int64"): 7, - np.dtype("float16"): 10, - np.dtype("double"): 11, -} - - def _get_quant_dequant_output(model, input_data, output_data, providers): """Get loss between fp32 output and QDQ output. 
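The `_get_dynamic_input_quantization_params_uint8` helper added above emits ReduceMin/ReduceMax, Sub, Div, Floor and Cast nodes so the activation scale and zero point are computed in-graph at inference time. A minimal NumPy sketch of the arithmetic those nodes perform, assuming the default uint8 range (no reduce_range); the function name is illustrative and not part of the patch:

    import numpy as np

    def dynamic_uint8_scale_zp(x: np.ndarray):
        # ReduceMin / ReduceMax over the whole tensor (keepdims=0 in the graph)
        rmin, rmax = float(x.min()), float(x.max())
        qmin, qmax = 0, 255                        # uint8 range from get_qmin_qmax_for_qType
        scale = (rmax - rmin) / (qmax - qmin)      # Sub, then Div by the fixed quantize range
        # Sub(0, rmin) -> Div by scale -> Floor -> Cast to uint8
        # (no zero-range guard here, mirroring the in-graph computation)
        zero_point = np.uint8(np.floor((0.0 - rmin) / scale))
        return np.float32(scale), zero_point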
@@ -47,7 +37,7 @@ def _get_quant_dequant_output(model, input_data, output_data, providers): output_data (numpy.ndarray): fp32 output providers (list): execution provider """ - input_data = _quant_dequant_data(input_data, 2, "asym") + input_data = quant_utils.qdq_data(input_data, 2, False) sess = ort.InferenceSession(model.SerializeToString(), providers=providers) preds = sess.run(None, {model.graph.input[0].name: input_data}) loss = np.sum(np.abs(output_data - preds) ** 2) @@ -65,28 +55,22 @@ def _make_sub_graph(node, inits, input_data, output_data, opset, ir_version): opset (object): opset of the model ir_version (object): ir_version of the model """ - input = onnx.helper.make_tensor_value_info(node.input[0], _dtype_map[input_data.dtype], input_data.shape) - output = onnx.helper.make_tensor_value_info(node.output[0], _dtype_map[output_data.dtype], output_data.shape) + input = onnx.helper.make_tensor_value_info( + node.input[0], + onnx.helper.np_dtype_to_tensor_dtype(input_data.dtype), + input_data.shape, + ) + output = onnx.helper.make_tensor_value_info( + node.output[0], + onnx.helper.np_dtype_to_tensor_dtype(output_data.dtype), + output_data.shape, + ) graph = onnx.helper.make_graph([node], "sub_graph", [input], [output], inits) model = onnx.helper.make_model(graph, opset_imports=opset) model.ir_version = ir_version return model -def _quant_dequant_data(data, qType=3, scheme="sym"): - """Quantize and then dequantize data. - - Args: - data (numpy.ndarray): target data - qType (int): data type - scheme (str): sym or asym quantization - """ - rmin, rmax, zero_point, scale, quantized_data = utility.quantize_data( - data.flatten().tolist(), utility.get_qrange_for_qType(qType, False), qType, scheme - ) - return ((quantized_data - zero_point) * scale).astype(data.dtype).reshape(data.shape) - - class Smoother: """Fake input channel quantization. 
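In the smoother hunks above, the local `_quant_dequant_data` helper is replaced by the shared `quant_utils.qdq_data`, which quantizes a tensor and immediately dequantizes it so the fake-quantization effect can be evaluated. A hedged usage sketch, assuming the new `algorithms.utility` module is importable as shown in the diff; the data and variable names are illustrative only:

    import numpy as np
    from onnx_neural_compressor.algorithms import utility as quant_utils

    data = np.random.randn(64, 64).astype(np.float32)

    # Asymmetric uint8 round-trip, as _get_quant_dequant_output applies to the sub-graph input.
    act_qdq = quant_utils.qdq_data(data, 2, False)      # 2 == onnx.TensorProto.UINT8

    # Symmetric int8 round-trip, as _get_output_loss applies to weight initializers.
    weight_qdq = quant_utils.qdq_data(data, 3, True)    # 3 == onnx.TensorProto.INT8

    # Illustrative reconstruction error; the patch itself measures loss on the sub-graph outputs.
    err = np.mean((data - weight_qdq) ** 2)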
@@ -102,7 +86,7 @@ def __init__( self, model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], dataloader: data_reader.CalibrationDataReader, - providers: List[str] = ["CPUExecutionProvider"], + execution_provider: str = "CPUExecutionProvider", ): """Initialize the attributes of class.""" self.model = ( @@ -112,7 +96,7 @@ def __init__( self.value_infos.update({ot.name: ot for ot in self.model.model.graph.output}) self.value_infos.update({it.name: it for it in self.model.model.graph.input}) self.dataloader = dataloader - self.providers = providers + self.providers = [execution_provider] self.tensor_scales_info = {} self.new_added_mul_nodes = [] self.new_added_value_info = [] @@ -204,7 +188,7 @@ def _dump_op_info(self, percentile, op_types, iterations): self.model, self.dataloader, iterations=list(range(0, iterations)), - backend=self.providers, + execution_provider=self.providers, ) self.max_vals_per_channel, self.shape_info, self.tensors_to_node = sq_calibrator.calib_smooth( @@ -382,7 +366,7 @@ def _get_output_loss(self, node_name, scale, calib_iter): ) base_dir = "" if not self.model.is_large_model else os.path.dirname(self.model.model_path) weight = onnx.numpy_helper.to_array(self.model.get_initializer(node.input[1]), base_dir) - weight_q = _quant_dequant_data(weight) + weight_q = quant_utils.qdq_data(weight, 3, True) self.model.set_initializer(node.input[1], weight_q) inits = [self.model.get_initializer(i) for i in node.input if self.model.get_initializer(i) is not None] @@ -468,7 +452,7 @@ def _auto_tune_alpha( self._adjust_weights(scale) input_scale = ( self._reshape_scale_for_input(tensor_name, key) - if not (node.op_type == "Gemm" and utility.is_B_transposed(node)) + if not (node.op_type == "Gemm" and quant_utils.is_B_transposed(node)) else self.tensor_scales_info[key] ) loss = self._get_output_loss(node_info[0], input_scale, calib_iter) @@ -505,7 +489,6 @@ def _get_smooth_scales(self, alpha, target_list=[]): Returns: the smooth scales for weights, currently one input tensor only have one scale """ - logger.info("Start smooth scales collection.") scales = {} for tensor, nodes in self.tensors_to_node.items(): # if scales_per_op the key of scales is the node name, otherwise the activation of node @@ -519,7 +502,7 @@ def _get_smooth_scales(self, alpha, target_list=[]): base_dir=os.path.dirname(self.model.model_path) if self.model.model_path is not None else "", ) if (len(weight.shape) == 4 and weight.shape[1] != 1) or ( - node.op_type == "Gemm" and utility.is_B_transposed(node) + node.op_type == "Gemm" and quant_utils.is_B_transposed(node) ): weight = np.moveaxis(weight, 0, 1) specific_alpha = alpha[node_info[0]] if isinstance(alpha, dict) else alpha @@ -535,7 +518,7 @@ def _get_smooth_scales(self, alpha, target_list=[]): base_dir=os.path.dirname(self.model.model_path) if self.model.model_path is not None else "", ) if (len(weight.shape) == 4 and weight.shape[1] != 1) or ( - node.op_type == "Gemm" and utility.is_B_transposed(node) + node.op_type == "Gemm" and quant_utils.is_B_transposed(node) ): weight = np.moveaxis(weight, 0, 1) weight = weight.reshape(weight.shape[0], -1) @@ -588,7 +571,7 @@ def _insert_smooth_mul_op(self, scales): name = key + "_" + "smooth_scale" scale_tensor = onnx.helper.make_tensor( name=key + "_" + "smooth_scale", - data_type=onnx.onnx_pb.TensorProto.FLOAT, + data_type=onnx.TensorProto.FLOAT, dims=scale_factor.shape, vals=scale_factor.flatten().tolist(), ) @@ -632,7 +615,7 @@ def _adjust_weights(self, scales): if len(weight.shape) == 2: scale = 
( np.expand_dims(scales[key], axis=0) - if node.op_type == "Gemm" and utility.is_B_transposed(node) + if node.op_type == "Gemm" and quant_utils.is_B_transposed(node) else np.expand_dims(scales[key], axis=-1) ) new_weight = weight * scale diff --git a/onnx_neural_compressor/algorithms/utility.py b/onnx_neural_compressor/algorithms/utility.py new file mode 100644 index 000000000..d802dc04d --- /dev/null +++ b/onnx_neural_compressor/algorithms/utility.py @@ -0,0 +1,702 @@ +# Copyright (c) 2023 MIT HAN Lab +# This source code is licensed under the MIT license +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import struct +import sys +from importlib import util + +import numpy as np +from packaging import version + +from onnx_neural_compressor import constants, utility + +if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): # pragma: no cover + import onnxruntime_extensions + +onnx = utility.LazyImport("onnx") +ort = utility.LazyImport("onnxruntime") + +__producer__ = "onnx.quantize" +__version__ = "0.1.0" +onnx_domain = "ai.onnx" +ms_domain = "com.microsoft" +QUANT_OP_NAME_SUFFIX = "_quant" + + +def attribute_to_kwarg(attribute): + """Convert attribute to kwarg format for use with onnx.helper.make_node.""" + attribute_mapping = { + 1: attribute.f, + 2: attribute.i, + 3: attribute.s, + 4: attribute.t, + 5: attribute.g, + 6: attribute.floats, + 7: attribute.ints, + 8: attribute.strings, + 9: attribute.tensors, + 10: attribute.graphs, + } + if attribute.type in attribute_mapping: + value = attribute_mapping[attribute.type] + else: # pragma: no cover + raise ValueError( + "attribute {} has no type specified " "or unsupported type {}.".format(attribute.name, attribute.type) + ) + return {attribute.name: value} + + +ONNX_INT_TYPE_RANGE = { + onnx.TensorProto.UINT8: (0, 255), + onnx.TensorProto.INT8: (-128, 127), +} + +ONNX_INT_TYPE_SYMMETRIC_RANGE = { + onnx.TensorProto.INT8: (-127, 127), +} + +ONNX_INT_TYPE_REDUCED_RANGE = { + onnx.TensorProto.UINT8: (0, 127), + onnx.TensorProto.INT8: (-64, 64), +} + +ONNX_STR_TYPE_RANGE = { + "int1": (-1, 0), + "int2": (-2, 1), + "int3": (-4, 3), + "int4": (-8, 7), # onnx >= 1.16.0 defines TensorProto.INT4 + "int5": (-16, 15), + "int6": (-32, 31), + "int7": (-64, 63), + "int8": (-128, 127), + "uint1": (0, 1), + "uint2": (0, 3), + "uint3": (0, 7), + "uint4": (0, 15), # onnx >= 1.16.0 defines TensorProto.UINT4 + "uint5": (0, 31), + "uint6": (0, 63), + "uint7": (0, 127), + "uint8": (0, 255), +} + + +def _qType_to_np_type(qType): + if isinstance(qType, int): + return onnx.helper.tensor_dtype_to_np_dtype(qType) + elif isinstance(qType, str) and "uint" in qType: + return np.dtype("uint8") + else: + return np.dtype("int8") + + +def find_by_name(name, item_list): + """Helper function to find item by name in a list.""" + items = [] + for item in item_list: + assert hasattr(item, "name"), "{} should have a 'name' attribute defined".format(item) + if item.name == name: + items.append(item) + if 
len(items) > 0: + return items[0] + else: + return None + + +def get_qmin_qmax_for_qType(qType, reduce_range=False, sym=False): # noqa: N802 + """Get qmin, qmax for qType. + + Args: + qType (int or str): int for onnx defined type, str for onnx not defined type + reduce_range (bool, optional): whether use 7 bit for 8bit quantization + sym (bool, optional): quantization scheme. Defaults to False. + """ + if qType == onnx.TensorProto.FLOAT8E4M3FN: + raise NotImplementedError("This function is not implemented for float 8 as not needed.") + + qrange = None + + if isinstance(qType, str): + qrange = ONNX_STR_TYPE_RANGE.get(qType) + elif reduce_range: + qrange = ONNX_INT_TYPE_REDUCED_RANGE.get(qType) + elif sym and qType in ONNX_INT_TYPE_SYMMETRIC_RANGE: + qrange = ONNX_INT_TYPE_SYMMETRIC_RANGE[qType] + else: + qrange = ONNX_INT_TYPE_RANGE.get(qType) + + if not qrange: + raise ValueError(f"Unexpected data type {qType} requested.") + + return qrange + + +def quantize_nparray(dtype, arr, scale, zero_point, low=None, high=None): + """Quantize numpy array.""" + q_weight = np.empty_like(np.asarray(arr), dtype=np.asarray(scale).dtype) + np.divide(arr, scale, out=q_weight) + np.add(q_weight, zero_point, out=q_weight) + np.round(q_weight, out=q_weight) + if low is not None and high is not None: + np.clip(q_weight, low, high, out=q_weight) + return q_weight.astype(dtype) + + +def quantize_data_per_channel(data, axis, qType, sym, reduce_range=False): + """Quantize tensor per-channel.""" + quantize_range = get_qmin_qmax_for_qType(qType, reduce_range, sym) + rmin = None + rmax = None + for i in range(len(data.shape)): + if i != axis: + rmin = np.min(data, axis=i, keepdims=True) if rmin is None else np.min(rmin, axis=i, keepdims=True) + rmax = np.max(data, axis=i, keepdims=True) if rmax is None else np.max(rmax, axis=i, keepdims=True) + rmin = np.minimum(rmin, 0) + rmax = np.maximum(rmax, 0) + scale, zero_point = calculate_scale_zp(rmin, rmax, qType, sym, reduce_range) + + dtype = _qType_to_np_type(qType) + quantized_data = quantize_nparray(dtype, data, scale, zero_point, low=quantize_range[0], high=quantize_range[1]) + return rmin.reshape(-1, 1), rmax.reshape(-1, 1), zero_point.reshape(-1, 1), scale.reshape(-1, 1), quantized_data + + +def dequantize_data_with_scale_zero(tensor_value, scale_value, zo_value): # pragma: no cover + """Dequantize tensor with scale and zero point.""" + return (tensor_value.astype(scale_value.dtype) - zo_value.astype(scale_value.dtype)) * scale_value + + +def dequantize_data(tensor_value, scale_value, zo_value, axis=0): # pragma: no cover + """Dequantize tensor.""" + if not isinstance(scale_value, np.ndarray): + return dequantize_data_with_scale_zero(tensor_value, scale_value, zo_value) + else: + channel_count = tensor_value.shape[axis] # TBD, default from axis 0 + new_per_channel_tensor_values = [] + for i in range(channel_count): + per_channel_tensor_value = tensor_value.take(i, axis) + per_channel_scale_value = scale_value.take(i) + per_channel_zero_value = zo_value.take(i) + new_per_channel_tensor_values.append( + dequantize_data_with_scale_zero( + per_channel_tensor_value, per_channel_scale_value, per_channel_zero_value + ) + ) + # combine per_channel_data into one + reshape_dims = list(tensor_value.shape) # deep copy + reshape_dims[axis] = 1 # only one per channel for reshape + new_tensor_value = new_per_channel_tensor_values[0].reshape(reshape_dims) + for i in range(1, channel_count): + new_per_channel_tensor_value = new_per_channel_tensor_values[i].reshape(reshape_dims) + 
new_tensor_value = np.concatenate((new_tensor_value, new_per_channel_tensor_value), axis) + return new_tensor_value + + +def calculate_scale_zp(rmin, rmax, qType, sym, reduce_range=False): + """Calculate scale and zero point.""" + qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, sym) + dtype = _qType_to_np_type(qType) + if isinstance(rmax, np.ndarray): + if sym: + max_range = np.maximum(abs(rmin), abs(rmax)) + rmin = -max_range + rmax = max_range + scale = (rmax - rmin) / (qmax - qmin) + scale[scale < np.finfo(rmax.dtype).tiny] = 1 + zero_point = ( + np.multiply(np.ones(rmax.shape), np.round((qmax + qmin) / 2.0)).astype(dtype) + if sym + else np.round(qmin - rmin / scale).astype(dtype) + ) + else: + if sym: + max_range = max(abs(rmin), abs(rmax)) + scale = (float(max_range) * 2) / (qmax - qmin) if max_range > 0 else 1 + else: + scale = (float(rmax) - float(rmin)) / (qmax - qmin) if rmin != rmax else 1 + zero_point = np.round((qmax + qmin) / 2.0).astype(dtype) if sym else np.round(qmin - rmin / scale).astype(dtype) + return np.float32(scale), zero_point + + +def quantize_data(data, qType, sym, reduce_range=False, ratio=1.0, axis=None): + """Quantize data. + + To pack weights, we compute a linear transformation + - when data type == uint8 mode, from [rmin, rmax] -> [0, 2^{b-1}] and + - when data type == int8, from [-m , m] -> [-(2^{b-1}-1), 2^{b-1}-1] where + m = max(abs(rmin), abs(rmax)) + and add necessary intermediate nodes to transform quantized weight to full weight + using the equation r = S(q-z), where + r: real original value + q: quantized value + S: scale + z: zero point + + Args: + data (array): data to quantize + qType (int): data type to quantize to. Supported types UINT8 and INT8 + sym (bool): whether use sym quantization. + reduce_range (bool): whether use 7 bit or not. Defaults to False + ratio (float, optional): percentile of clip. Defaults to 1.0 + axis (int, optional): process data along a specific axis. Default is None (process the whole data) + """ + quantize_range = get_qmin_qmax_for_qType(qType, reduce_range, sym) + rmin = np.min(np.min(data), 0) if axis is None else np.min(data, axis=1, keepdims=True) + rmax = np.max(np.max(data), 0) if axis is None else np.max(data, axis=1, keepdims=True) + rmin *= ratio + rmax *= ratio + + scale, zero_point = calculate_scale_zp(rmin, rmax, qType, sym, reduce_range) + dtype = _qType_to_np_type(qType) + quantized_data = quantize_nparray(dtype, data, scale, zero_point, low=quantize_range[0], high=quantize_range[1]) + return rmin, rmax, zero_point, scale, quantized_data + + +def qdq_data(data, qType, sym, reduce_range=False, ratio=1.0, axis=None): + _, _, zero_point, scale, quantized_data = quantize_data(data, qType, sym, reduce_range, ratio, axis) + return scale * (quantized_data - zero_point) + + +def is_B_transposed(node): + """Whether inuput B is transposed.""" + transB = [attr for attr in node.attribute if attr.name == "transB"] + if len(transB): + return 0 < onnx.helper.get_attribute_value(transB[0]) + return False + + +def is_quantizable_type(data_type): + return data_type in [onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16, onnx.TensorProto.BFLOAT16] + + +def _get_blob_size(group_size, has_zp): # pragma: no cover + """Get blob_size. 
+ + Args: + group_size (int): how many elements share one scale/zp + has_zp (bool): whether zero_point is None + """ + if version.Version(ort.__version__) > constants.ONNXRT1161_VERSION: + blob_size = group_size // 2 + elif has_zp: + blob_size = group_size // 2 + 4 + 1 + else: + blob_size = group_size // 2 + 4 + return blob_size + + +def make_matmul_weight_only_node( + node: onnx.NodeProto, + weight_shape: tuple, + num_bits: int, + group_size: int, + k_blocks: int, + q_weight: np.array, + scale: np.array, + zero_point: np.array, + accuracy_level: int = 0, +): + """Build MatMulFpQ4/MatMulNBits node. + + Args: + node (onnx.NodeProto): original matmul node + weight_shape (tuple): original weight shape + num_bits (int): number of bits used to represent weights. + group_size (int): how many elements share one scale/zp + k_blocks (int): block number + q_weight (np.array): quantized weight + scale (np.array): scale + zero_point (np.array): zero point + accuracy_level (int, optional): accuracy level. + Support 0 (unset), 1(fp32 compute type of jblas kernel), + 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), + 4 (int8 compute type of jblas kernel) Defaults to 0. + + Returns: + matmul_weight_only_node: MatMulFpQ4 or MatMulNBits node + new_inits: initializers of the new node + """ + blob_size = _get_blob_size(group_size, zero_point is not None) + packed = np.zeros((q_weight.shape[0], blob_size), dtype="uint8") + q_weight_name = node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)) + input_names = [node.input[0], q_weight_name] + new_inits = [] + kwargs = {} + + if version.Version(ort.__version__) > constants.ONNXRT1161_VERSION: + op_type = "MatMulNBits" + + # pack quantized weight + q_weight_pairs = q_weight[:, ::2] | q_weight[:, 1::2] << 4 + packed[:, :] = q_weight_pairs[:, :blob_size] + packed = np.reshape(packed, (-1, k_blocks, blob_size)) + + # build scale tensor + scale = np.reshape(scale, (-1, k_blocks)) + scale_tensor = onnx.helper.make_tensor( + name=node.input[1] + "_scale", + data_type=onnx.helper.np_dtype_to_tensor_dtype(scale.dtype), + dims=scale.shape, + vals=scale.tobytes(), + raw=True, + ) + input_names.append(scale_tensor.name) + new_inits.append(scale_tensor) + + # build zero_point tensor + if zero_point is not None: + if num_bits > 4: + packed_zp = np.reshape(zero_point, (1, -1)).astype("uint8") + else: + packed_zp = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8") + # create an index array + idx = np.arange(zero_point.shape[0] // k_blocks * k_blocks).reshape(-1) + # separate odd and even indices + even_idx = idx[::2] + odd_idx = idx[1::2] + # vectorized operation for even and odd indices + packed_zp[even_idx // 2] = (packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel() + packed_zp[odd_idx // 2] = (packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4) + + zp_tensor = onnx.helper.make_tensor( + name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True + ) + input_names.append(zp_tensor.name) + new_inits.append(zp_tensor) + + # set kwargs + kwargs["K"] = weight_shape[0] + kwargs["N"] = weight_shape[1] + kwargs["bits"] = num_bits + kwargs["block_size"] = group_size + if accuracy_level > 0: + # require onnxruntime > 1.16.3 + kwargs["accuracy_level"] = accuracy_level + + else: + offset = 5 if zero_point is not None else 4 + op_type = "MatMulFpQ4" + + # pack quantized weight + for i in range(q_weight.shape[0]): + bf = struct.pack("f", scale[i]) + packed[i][0] = bf[0] + 
packed[i][1] = bf[1] + packed[i][2] = bf[2] + packed[i][3] = bf[3] + + if zero_point is not None: + packed[i][4] = zero_point[i] + + packed[i][offset:] = np.bitwise_or( + q_weight[i][: group_size // 2], np.left_shift(q_weight[i][group_size // 2 :], num_bits) + ) + packed = packed.reshape(-1) + + # build shape tensor + shape_tensor = onnx.helper.make_tensor( + name=node.input[1] + "_shape", data_type=7, dims=(2,), vals=np.array(weight_shape, dtype="int64") + ) + new_inits.append(shape_tensor) + input_names.append(shape_tensor.name) + + # set kwargs + kwargs["blk_quant_type"] = 1 if zero_point is not None else 0 + + q_weight_tensor = onnx.helper.make_tensor( + name=q_weight_name, + data_type=2, + dims=packed.shape, + vals=packed.tobytes(), + raw=True, + ) + new_inits.append(q_weight_tensor) + + matmul_weight_only_node = onnx.helper.make_node( + op_type, + inputs=input_names, + outputs=node.output, + name=node.name + "_Q" + str(num_bits) if node.name else "_Q" + str(num_bits), + domain="com.microsoft", + **kwargs, + ) + return matmul_weight_only_node, new_inits + + +def prepare_inputs(model, data_reader, providers): + """Prepare inputs for weight only quantization. + + Args: + model (ModelProto or onnx_model.ONNXModel): onnx model. + data_reader (CalibrationDataReader): a calibration data reader. + providers (list): providers to use. + + Returns: + inputs: prepared inputs. + so: session options + """ + + so = ort.SessionOptions() + if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): # pragma: no cover + so.register_custom_ops_library(onnxruntime_extensions.get_library_path()) + if model.is_large_model: + onnx.save_model( + model.model, + model.model_path + "_augment.onnx", + save_as_external_data=True, + all_tensors_to_one_file=True, + convert_attribute=False, + ) + + inputs_list = [] + while True: + inputs = data_reader.get_next() + if not inputs: + break + inputs_list.append(inputs) + return inputs_list, so + + +def pad_tensor(weight, group_size, k_blocks): + """Pad tensor rowi so that it can be is divisible by group_size. + + Args: + weight (array): weight + group_size (int): how many elements share one scale/zp + k_blocks (int): the number of block + + Returns: + weight: paded weight + """ + if group_size == -1: + return weight + + org_w_shape = weight.shape + padded_rows = k_blocks * group_size + pad_len = padded_rows - org_w_shape[0] + + if pad_len > 0: + weight = np.pad(weight, ((0, pad_len), (0, 0)), "constant") + + return weight + + +def dump_woq_stats(model, quantize_config): + res = {} + + dtype_set = set() + for node in model.graph.node: + if node.name.split("_Q")[0] not in quantize_config: + continue + if node.op_type in ["MatMulFpQ4", "MatMulNBits"]: + optype = "MatMul" + else: + optype = node.op_type + + if optype not in res: + res[optype] = {} + if re.fullmatch("^.*_Q\d*G\d*", node.input[1]): + search_out = re.search("_Q\d*", node.input[1]) + dtype = "A32W{}G{}".format( + node.input[1][search_out.start() + 2 : search_out.end()], node.input[1][search_out.end() + 1 :] + ) + else: + dtype = "FP32" + dtype_set.add(dtype) + + if dtype in res[optype]: + res[optype][dtype] += 1 + else: + res[optype][dtype] = 1 + + dtype_list = list(dtype_set) + for dtype in dtype_list: + for optype in res.keys(): + if dtype not in res[optype]: + res[optype][dtype] = 0 + + # update stats format for dump. 
+ field_names = ["Op Type", "Total"] + field_names.extend(dtype_list) + output_data = [] + for op_type in res.keys(): + field_results = [op_type, sum(res[op_type].values())] + field_results.extend([res[op_type][dtype] for dtype in dtype_list]) + output_data.append(field_results) + + utility.Statistics(output_data, header="Mixed Precision Statistics", field_names=field_names).print_stat() + + +def get_node_original_name(node) -> str: + """Get the original name of the given node.""" + node_name: str = node.name + # TODO how to handle the unquantized node that has the `_quant` suffix, such as `conv_quant`? + if node_name.endswith(QUANT_OP_NAME_SUFFIX): + return node_name[: -len(QUANT_OP_NAME_SUFFIX)] + else: + # For unquantized nodes + return node_name + + +def split_shared_bias(model): + """Split shared tensor.""" + input_name_to_nodes = model.input_name_to_nodes() + for input_name, node_list in input_name_to_nodes.items(): + if len(node_list) > 1 and input_name in [i.name for i in model.model.graph.initializer]: + for node in node_list[1:]: + if node.op_type not in ["Conv", "FusedConv"]: + continue + if len(node.input) > 2 and node.input[2] == input_name: + new_input_name = node.input[2] + "_nc_split_" + node.name + new_input = onnx.helper.make_tensor( + new_input_name, + model.get_initializer(input_name).data_type, + model.get_initializer(input_name).dims, + model.get_initializer(input_name).raw_data, + True, + ) + model.add_initializer(new_input) + node.input[2] = new_input_name + return model + + +def remove_init_from_model_input(model): + """Remove initializer from model input.""" + inputs = model.model.graph.input + name_to_input = {} + for inp in inputs: + name_to_input[inp.name] = inp + for initializer in model.model.graph.initializer: + if initializer.name in name_to_input: + inputs.remove(name_to_input[initializer.name]) + + +class QuantizedValue: + """Represents a linearly quantized value (input/output/initializer).""" + + def __init__( + self, + name, + new_quantized_name, + scale_name, + zero_point_name, + axis=None, + qType=1, + ): + """Initialization. + + Args: + name (string): tensor name + new_quantized_name (string): quantized tensor name + scale_name (string): scale name + zero_point_name (string): zero point name + axis (int, optional): quantized axis. Defaults to None. + qType (int, optional): quantized data type. Defaults to 1 (uint8). + """ + self.name = name + self.q_name = new_quantized_name + self.scale_name = scale_name + self.zp_name = zero_point_name + self.axis = axis + self.qType = qType + + +class QuantizedInitializer: + """Represents a linearly quantized weight input from ONNX operators.""" + + def __init__( + self, + name, + initializer, + rmins, + rmaxs, + zero_points, + scales, + data=[], + quantized_data=[], + axis=None, + qType=1, + ): + """Initialization. + + Args: + name (string): initializer name + initializer (onnx.onnx_ml_pb2.TensorProto): initializer + rmins (list): list of min value + rmaxs (list): list of max value + zero_points (list): list of zero point + scales (list): list of scale + data (list, optional): array version of the initializer. Defaults to []. + quantized_data (list, optional): quantized data. Defaults to []. + axis (int, optional): quantized axis. Defaults to None. + qType (int, optional): quantized data type. Defaults to 1 (uint8). 
+ """ + self.name = name + self.initializer = initializer # TensorProto initializer in ONNX graph + self.rmins = rmins # List of minimum range for each axis + self.rmaxs = rmaxs # List of maximum range for each axis + # 1D tensor of zero points computed for each axis. scalar if axis is empty + self.zero_points = zero_points + self.scales = scales # 1D tensor of scales computed for each axis. scalar if axis is empty + self.data = data # original data from initializer TensorProto + self.quantized_data = quantized_data # weight-packed data from data + # Scalar to specify which dimension in the initializer to weight pack. + self.axis = axis + # If empty, single zero point and scales computed from a single rmin and rmax + self.qType = qType + + +def dump_model_op_stats(model, quantize_config, fp32_op_list): + qdq_ops = ["QuantizeLinear", "DequantizeLinear", "DynamicQuantizeLinear"] + res = {} + for op_type in fp32_op_list: + res[op_type] = {"INT8": 0, "FP32": 0} + for op_type in qdq_ops: + res[op_type] = {"INT8": 0, "FP32": 0} + + for node in model.graph.node: + if node.name.endswith("_quant"): + if node.op_type.startswith("QLinear"): + origin_op_type = node.op_type.split("QLinear")[-1] + else: + origin_op_type = node.op_type.split("Integer")[0] + + if origin_op_type in ["QAttention", "QGemm"]: + origin_op_type = origin_op_type[1:] + elif origin_op_type == "DynamicQuantizeLSTM": + origin_op_type = "LSTM" + elif origin_op_type == "QEmbedLayerNormalization": + origin_op_type = "EmbedLayerNormalization" + res[origin_op_type]["INT8"] += 1 + + elif node.op_type in qdq_ops: + res[node.op_type]["INT8"] += 1 + + elif node.op_type in res: + res[node.op_type]["FP32"] += 1 + + field_names = ["Op Type", "Total", "INT8", "FP32"] + output_data = [ + [ + op_type, + sum(res[op_type].values()), + res[op_type]["INT8"], + res[op_type]["FP32"], + ] + for op_type in res.keys() + ] + + utility.Statistics(output_data, header="Quantization Statistics", field_names=field_names).print_stat() diff --git a/onnx_neural_compressor/algorithms/weight_only/awq.py b/onnx_neural_compressor/algorithms/weight_only/awq.py index 30d9e8442..9e07b45a6 100644 --- a/onnx_neural_compressor/algorithms/weight_only/awq.py +++ b/onnx_neural_compressor/algorithms/weight_only/awq.py @@ -24,9 +24,9 @@ import onnxruntime as ort from packaging import version -from onnx_neural_compressor import config, constants, data_reader, logger, onnx_model, utility +from onnx_neural_compressor import constants, data_reader, logger, onnx_model +from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor.algorithms.weight_only import rtn -from onnx_neural_compressor.algorithms.weight_only import utility as woq_utility from typing import List, Union # isort: skip @@ -39,7 +39,7 @@ def _get_weight_scale(weight, group_size): return scale -def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, group_size, scheme): +def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts): """Apply scale for salient weight.""" best_scales = {} new_init_tensors = [] @@ -48,6 +48,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, updated_nodes = [] base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" + input_name_to_nodes = model.input_name_to_nodes() for parent, nodes in absorb_pairs.items(): if any([node.input[0] not in output_dicts for node in nodes]): logger.warning( @@ -61,14 +62,17 @@ def _apply_awq_scale(model, weight_config, 
absorb_pairs, output_dicts, num_bits, dtype = None weight = [] org_out = [] + + num_bits = weight_config[nodes[0].name].get("weight_bits", 4) + group_size = weight_config[nodes[0].name].get("weight_group_size", 32) + sym = weight_config[nodes[0].name].get("weight_sym", True) + accuracy_level = weight_config[nodes[0].name].get("accuracy_level", 0) + + # use same params for all children of one parent for node in nodes: - if (node.name, node.op_type) in weight_config and weight_config.get( - (node.name, node.op_type), "fp32" - ) != "fp32": - num_bits = weight_config[(node.name, node.op_type)].get("weight_bits", 4) - group_size = weight_config[(node.name, node.op_type)].get("weight_group_size", 32) - scheme = "sym" if weight_config[(node.name, node.op_type)].get("weight_sym", True) else "asym" - break + weight_config.setdefault(node.name, {}).update({"weight_bits": num_bits}) + weight_config.setdefault(node.name, {}).update({"weight_group_size": group_size}) + weight_config.setdefault(node.name, {}).update({"weight_sym": sym}) # search scale best_error = float("inf") @@ -80,9 +84,6 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, ratio = ratio * 1 / n_grid loss = 0 for node in nodes: - if weight_config.get((node.name, node.op_type), {}) == "fp32": - continue - weight = onnx.numpy_helper.to_array(model.get_initializer(node.input[1]), base_dir) if len(weight.shape) != 2: continue @@ -95,7 +96,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, scales = np.clip(np.power(inp_scale, ratio) / np.power(w_scale, (1 - ratio)), 1e-4, None) scales = scales / np.sqrt(np.max(scales) * np.min(scales)) weight = weight.T * scales - weight = woq_utility.pad_tensor(weight, group_size, (org_w_shape[0] + group_size - 1) // group_size).T + weight = quant_utils.pad_tensor(weight.T, group_size, (org_w_shape[0] + group_size - 1) // group_size) if (version.Version(ort.__version__) > constants.ONNXRT1161_VERSION and num_bits == 4) or ( version.Version(ort.__version__) >= constants.ONNXRT116_VERSION @@ -104,16 +105,20 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, ): # pragma: no cover # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1 - q_weight = woq_utility.qdq_tensor(weight, num_bits, group_size, scheme, "uint") / np.expand_dims( - scales, axis=-1 - ) + q_weight = quant_utils.qdq_data( + weight.reshape((-1, group_size)), + "uint" + str(num_bits), + sym, + ).reshape(weight.shape) else: - q_weight = woq_utility.qdq_tensor(weight, num_bits, group_size, scheme, "int") / np.expand_dims( - scales, axis=-1 - ) - - q_weight = np.reshape(q_weight, (org_w_shape[1], -1))[:, : org_w_shape[0]] - out = np.matmul(inp, q_weight.T) + q_weight = quant_utils.qdq_data( + weight.reshape((-1, group_size)), + "int" + str(num_bits), + sym, + ).reshape(weight.shape) + + q_weight = q_weight[: org_w_shape[0], :] / np.expand_dims(scales, axis=-1) + out = np.matmul(inp, q_weight) loss += np.mean(np.power((org_out - out), 2)) is_best = loss < best_error @@ -123,10 +128,6 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, best_scale = scales for node in nodes: - weight_config.setdefault((node.name, node.op_type), {}).update({"weight_bits": num_bits}) - weight_config.setdefault((node.name, node.op_type), {}).update({"weight_group_size": group_size}) - weight_config.setdefault((node.name, node.op_type), 
{}).update({"weight_sym": scheme == "sym"}) - init_share_num = model.get_initializer_share_num(node.input[1]) weight_tensor = model.get_initializer(node.input[1]) tensor = onnx.numpy_helper.to_array(weight_tensor, base_dir) @@ -136,7 +137,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, new_tensor = onnx.helper.make_tensor( name=node.input[1] + "_scaled", - data_type=utility.dtype_mapping[str(dtype)], + data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype), dims=tensor.shape, vals=tensor.tobytes(), raw=True, @@ -152,7 +153,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, continue if parent.op_type in ["LayerNormalization", "BatchNormalization", "InstanceNormalization"] and len( - model.input_name_to_nodes()[nodes[0].input[0]] + input_name_to_nodes[nodes[0].input[0]] ) == len(nodes): for idx in [1, 2]: tensor = onnx.numpy_helper.to_array(model.get_initializer(parent.input[idx]), base_dir) @@ -165,7 +166,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, elif ( parent.op_type in ["SimplifiedLayerNormalization", "MatMul", "Gemm", "Mul"] and not all([model.get_initializer(inp) is None for inp in parent.input]) - and len(model.input_name_to_nodes()[nodes[0].input[0]]) == len(nodes) + and len(input_name_to_nodes[nodes[0].input[0]]) == len(nodes) ): # pragma: no cover for inp in parent.input: if model.get_initializer(inp) is not None: @@ -176,7 +177,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, updated_nodes.append(parent.name) output_dicts[parent.output[0]] = output_dicts[parent.output[0]] / np.reshape(best_scale, (1, -1)) - elif parent.op_type in ["Conv", "FusedConv"] and len(model.input_name_to_nodes()[nodes[0].input[0]]) == len( + elif parent.op_type in ["Conv", "FusedConv"] and len(input_name_to_nodes[nodes[0].input[0]]) == len( nodes ): # pragma: no cover tensor = onnx.numpy_helper.to_array(model.get_initializer(parent.input[2]), base_dir) @@ -190,7 +191,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, # insert mul scale_tensor = onnx.helper.make_tensor( name=parent.output[0] + "_weight_only_scale", - data_type=utility.dtype_mapping[str(dtype)], + data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype), dims=best_scale.shape, vals=(1.0 / best_scale).flatten().tolist(), ) @@ -216,7 +217,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, return model, output_dicts -def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, group_size, scheme): +def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts): """Apply clip for weight by checking mse.""" base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" ratios = {} @@ -232,18 +233,17 @@ def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, inp = np.concatenate(output_dicts[nodes[0].input[0]], axis=0) for node in nodes: - if (node.name, node.op_type) in weight_config: - num_bits = weight_config[(node.name, node.op_type)].get("weight_bits", 4) - group_size = weight_config[(node.name, node.op_type)].get("weight_group_size", 32) - scheme = "sym" if weight_config[(node.name, node.op_type)].get("weight_sym", True) else "asym" + num_bits = weight_config[node.name].get("weight_bits", 4) + group_size = weight_config[node.name].get("weight_group_size", 32) + sym = weight_config[node.name].get("weight_sym", True) + accuracy_level = 
weight_config[node.name].get("accuracy_level", 0) org_weight = onnx.numpy_helper.to_array(model.get_initializer(node.input[1]), base_dir=base_dir) org_w_shape = org_weight.shape # ic, oc group_size = group_size if group_size != -1 else org_w_shape[0] org_out = np.matmul(inp, org_weight) # n_token, oc - k_blocks = (org_w_shape[0] - 1) // group_size + 1 - org_weight = woq_utility.pad_tensor(org_weight, group_size, k_blocks) + org_weight = quant_utils.pad_tensor(org_weight, group_size, k_blocks) org_weight = np.transpose(org_weight) @@ -259,15 +259,21 @@ def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, ): # pragma: no cover # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1 - weight = woq_utility.qdq_tensor( - weight, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1) - ) + weight = quant_utils.qdq_data( + weight.reshape((-1, group_size)), + "uint" + str(num_bits), + sym, + ratio=ratio, + ).reshape(org_weight.shape) else: - weight = woq_utility.qdq_tensor( - weight, num_bits, group_size, scheme, "int", ratios.get(node.input[1], 1) - ) - weight = np.reshape(weight, (org_w_shape[1], -1))[:, : org_w_shape[0]] - cur_out = np.matmul(inp, weight.T) + weight = quant_utils.qdq_data( + weight.reshape((-1, group_size)), + "int" + str(num_bits), + sym, + ratio=ratio, + ).reshape(org_weight.shape) + + cur_out = np.matmul(inp, weight[:, : org_w_shape[0]].T) loss = np.mean(np.power((org_out - cur_out), 2)) is_best = loss < best_error if is_best: @@ -281,12 +287,8 @@ def awq_quantize( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], data_reader: data_reader.CalibrationDataReader, weight_config: dict = {}, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", enable_auto_scale: bool = True, enable_mse_search: bool = True, - accuracy_level: int = 0, providers: List[str] = ["CPUExecutionProvider"], ) -> onnx.ModelProto: """Quant the model with Activation-aware Weight quantization(AWQ) method. @@ -306,16 +308,10 @@ def awq_quantize( 'accuracy_level': 0 } }. Defaults to {}. - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): size of weight groups. Defaults to 32. - scheme (str, optional): indicates whether weights are symmetric. Defaults to "asym". enable_auto_scale (bool, optional): whether to search for best scales based on activation distribution. Defaults to True. enable_mse_search (bool, optional): whether to search for the best clip range from range [0.91, 1.0, 0.01]. Defaults to True. - accuracy_level (int, optional): accuracy level. Support 0 (unset), - 1(fp32 compute type of jblas kernel), 2 (fp16 compute type of jblas kernel), - 3 (bf16 compute type of jblas kernel), 4 (int8 compute type of jblas kernel). Defaults to 0. providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. 
Returns: @@ -327,7 +323,7 @@ def awq_quantize( full_ratio = {} if enable_mse_search: - inputs, so = woq_utility.prepare_inputs(model, data_reader, providers) + inputs, so = quant_utils.prepare_inputs(model, data_reader, providers) del data_reader org_output = copy.deepcopy(model.model.graph.output) @@ -341,7 +337,7 @@ def awq_quantize( if ( node.op_type in ["MatMul"] and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" + and weight_config.get(node.name, {}).get("weight_dtype", "fp32") != "fp32" ): output_names.append(node.input[0]) output_names = list(set(output_names)) @@ -361,18 +357,20 @@ def awq_quantize( else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers) ) + output_name_to_node = model.output_name_to_node() + input_name_to_nodes = model.input_name_to_nodes() for input_name in output_names: - parent = model.output_name_to_node()[input_name] + parent = output_name_to_node[input_name] dump_pairs = {parent.name: []} - for node in model.input_name_to_nodes()[input_name]: + for node in input_name_to_nodes[input_name]: # check op_type of node is MatMul # check dim 1 of input is weight tensor # check weight_type is not "fp32" if ( node.op_type in ["MatMul"] and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" + and weight_config.get(node.name, {}).get("weight_dtype", "fp32") != "fp32" ): dump_pairs[parent.name].append(model.get_node(node.name)) @@ -390,9 +388,6 @@ def awq_quantize( weight_config, dump_pairs, output_dicts, - num_bits, - group_size, - scheme, ) if enable_mse_search: ratios = _apply_awq_clip( @@ -400,9 +395,6 @@ def awq_quantize( weight_config, dump_pairs, output_dicts, - num_bits, - group_size, - scheme, ) del output_dicts del dump_pairs @@ -410,7 +402,7 @@ def awq_quantize( model.remove_tensors_from_outputs(output_names) model.model.graph.output.MergeFrom(org_output) - model = rtn.rtn_quantize(model, weight_config, num_bits, group_size, scheme, full_ratio, accuracy_level, providers) + model = rtn.rtn_quantize(model, weight_config, full_ratio, providers) return model @@ -418,6 +410,9 @@ def apply_awq_on_model( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], quant_config: dict, calibration_data_reader: data_reader.CalibrationDataReader, + enable_auto_scale: bool = True, + enable_mse_search: bool = True, + providers: List[str] = ["CPUExecutionProvider"], ) -> onnx.ModelProto: """Apply Activation-aware Weight quantization(AWQ) on onnx model. @@ -430,12 +425,11 @@ def apply_awq_on_model( onnx.ModelProto: quantized onnx model. 
""" # set model params - kwargs = {} - kwargs = {key: quant_config.pop(key) for key in config.AWQConfig.model_params_list if key in quant_config} - - # change op config to dict type - for op_name_type, op_config in quant_config.items(): - if isinstance(op_config, config.AWQConfig): - quant_config[op_name_type] = op_config.to_dict() - - return awq_quantize(model, data_reader=calibration_data_reader, weight_config=quant_config, **kwargs) + kwargs = { + "enable_auto_scale": enable_auto_scale, + "enable_mse_search": enable_mse_search, + "providers": providers, + } + q_model = awq_quantize(model, data_reader=calibration_data_reader, weight_config=quant_config, **kwargs) + quant_utils.dump_woq_stats(q_model, quant_config) + return q_model diff --git a/onnx_neural_compressor/algorithms/weight_only/gptq.py b/onnx_neural_compressor/algorithms/weight_only/gptq.py index 5016a2780..ae3813280 100644 --- a/onnx_neural_compressor/algorithms/weight_only/gptq.py +++ b/onnx_neural_compressor/algorithms/weight_only/gptq.py @@ -24,9 +24,10 @@ import onnxruntime as ort from packaging.version import Version -from onnx_neural_compressor import config, constants, data_reader, onnx_model, utility +from onnx_neural_compressor import constants, data_reader, onnx_model, utility +from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor.algorithms.layer_wise import core -from onnx_neural_compressor.algorithms.weight_only import utility as woq_utility +from onnx_neural_compressor.quantization import config from typing import List, Union # isort: skip @@ -36,8 +37,8 @@ def _gptq( H: np.array, num_bits: int = 4, group_size: int = 32, - scheme: str = "asym", - blocksize: int = 128, + sym: bool = False, + block_size: int = 128, percdamp: float = 0.01, actorder: bool = False, mse: bool = False, @@ -50,8 +51,8 @@ def _gptq( H (np.array): Hessian matrix. num_bits (int, optional): num_bits. Default is 4. group_size (int, optional): how many elements share one scale/zp. Default is 32. - scheme (str, optional): sym or asym. Defaults to "asym". - blocksize (int, optional): blocksize to quantize weight. + sym (bool, optional): sym or asym. Defaults to False. + block_size (int, optional): block_size to quantize weight. percdamp (float, optional): percent of the average Hessian diagonal to use for dampening. actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value. mse (bool, optional): whether get scale and zero point with mse error. 
@@ -74,7 +75,7 @@ def find_params(weight): tmp = np.zeros(weight.shape[1]) xmin = np.minimum(np.min(weight, axis=0), tmp) xmax = np.maximum(np.max(weight, axis=0), tmp) - if scheme == "sym": + if sym: xmax = np.maximum(np.abs(xmin), xmax) tmp = xmin < 0 if np.any(tmp): @@ -84,7 +85,7 @@ def find_params(weight): xmax[tmp] = +1 scale = (xmax - xmin) / maxq - if scheme == "sym": + if sym: zero = np.ones(scale.shape) * (maxq + 1) / 2 else: zero = np.round(-xmin / scale) @@ -95,7 +96,7 @@ def find_params(weight): xmin1 = p * xmin xmax1 = p * xmax scale1 = (xmax1 - xmin1) / maxq - zero1 = np.round(-xmin1 / scale1) if scheme != "sym" else zero + zero1 = np.round(-xmin1 / scale1) if not sym else zero q = np.clip(np.round(weight / scale1) + zero1, 0, maxq) q -= weight q = np.power(np.abs(q), norm) @@ -134,8 +135,8 @@ def find_params(weight): H[diag, diag] += damp # add a average value of H = np.linalg.cholesky(np.linalg.inv(H)).T Hinv = H - for i1 in range(0, shape[0], blocksize): - i2 = min(i1 + blocksize, shape[0]) + for i1 in range(0, shape[0], block_size): + i2 = min(i1 + block_size, shape[0]) count = i2 - i1 W1 = copy.deepcopy(W[i1:i2, :]) @@ -178,15 +179,11 @@ def gptq_quantize( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], data_reader: data_reader.CalibrationDataReader, weight_config: dict = {}, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", percdamp: float = 0.01, - blocksize: int = 128, + block_size: int = 128, actorder: bool = False, mse: bool = False, perchannel: bool = True, - accuracy_level: int = 0, providers: List[str] = ["CPUExecutionProvider"], return_modelproto: bool = True, ): @@ -206,19 +203,13 @@ def gptq_quantize( 'weight_sym': True, 'accuracy_level': 0 }. Defaults to {}. - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): size of weight groups. Defaults to 32. - scheme (str, optional): indicates whether weights are symmetric. Defaults to "asym". percdamp (float, optional): percentage of Hessian's diagonal values' average, which will be added to Hessian's diagonal to increase numerical stability. Defaults to 0.01. - blocksize (int, optional): execute GPTQ quantization per block. Defaults to 128. + block_size (int, optional): execute GPTQ quantization per block. Defaults to 128. actorder (bool, optional): whether to sort Hessian's diagonal values to rearrange channel-wise quantization order. Defaults to False. mse (bool, optional): whether get scale and zero point with mse error. Defaults to False. perchannel (bool, optional): whether quantize weight per-channel. Defaults to True. - accuracy_level (int, optional): accuracy level. Support 0 (unset), - 1(fp32 compute type of jblas kernel), 2 (fp16 compute type of jblas kernel), - 3 (bf16 compute type of jblas kernel), 4 (int8 compute type of jblas kernel). Defaults to 0. providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. return_modelproto (bool, optionmal): whether to return onnx.Modelproto. set False for layer-wise quant. 
Default to True @@ -230,7 +221,7 @@ def gptq_quantize( model = onnx_model.ONNXModel(model) base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" - inputs, so = woq_utility.prepare_inputs(model, data_reader, providers) + inputs, so = quant_utils.prepare_inputs(model, data_reader, providers) del data_reader org_output = copy.deepcopy(model.model.graph.output) model.remove_tensors_from_outputs([i.name for i in org_output]) @@ -242,7 +233,7 @@ def gptq_quantize( if ( node.op_type in ["MatMul"] and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" + and weight_config.get(node.name, {}).get("weight_dtype", "fp32") != "fp32" ): output_names.append(node.input[0]) output_names = list(set(output_names)) @@ -262,19 +253,21 @@ def gptq_quantize( else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers) ) + input_name_to_nodes = model.input_name_to_nodes() + for idx, input_name in enumerate(output_names): utility.simple_progress_bar(len(output_names), idx + 1) node_list = [] weights = [] - for node in model.input_name_to_nodes()[input_name]: + for node in input_name_to_nodes[input_name]: # check op_type of node is MatMul # check dim 1 of input is weight tensor # check weight_type is not "fp32" if ( node.op_type in ["MatMul"] and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" + and weight_config.get(node.name, {}).get("weight_dtype", "fp32") != "fp32" ): weight = onnx.numpy_helper.to_array( model.get_initializer(model.get_node(node.name).input[1]), base_dir @@ -304,11 +297,10 @@ def gptq_quantize( weight, H, ) in zip(node_list, weights, Hs): - if (node.name, node.op_type) in weight_config: - num_bits = weight_config[(node.name, node.op_type)].get("weight_bits", 4) - group_size = weight_config[(node.name, node.op_type)].get("weight_group_size", 32) - scheme = "sym" if weight_config[(node.name, node.op_type)].get("weight_sym", True) else "asym" - accuracy_level = weight_config[(node.name, node.op_type)].get("accuracy_level", 0) + num_bits = weight_config[node.name].get("weight_bits", 4) + group_size = weight_config[node.name].get("weight_group_size", 32) + sym = weight_config[node.name].get("weight_sym", True) + accuracy_level = weight_config[node.name].get("accuracy_level", 0) group_size = group_size if group_size != -1 else weight.shape[0] dtype = weight.dtype @@ -317,8 +309,8 @@ def gptq_quantize( H, num_bits=num_bits, group_size=group_size, - scheme=scheme, - blocksize=blocksize, + sym=sym, + block_size=block_size, percdamp=percdamp, actorder=actorder, mse=mse, @@ -340,10 +332,14 @@ def gptq_quantize( # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1, supported by CPU EP AND CUDA EP org_shape = weight.shape k_blocks = (org_shape[0] + group_size - 1) // group_size - q_weight = woq_utility.pad_tensor(q_weight, group_size, k_blocks) - q_weight, scale, zp = woq_utility.quant_tensor(q_weight.T, num_bits, group_size, scheme, "uint") - - q_matmul_node, new_inits = woq_utility.make_matmul_weight_only_node( + q_weight = quant_utils.pad_tensor(q_weight, group_size, k_blocks) + _, _, zp, scale, q_weight = quant_utils.quantize_data( + q_weight.T, + "uint" + str(num_bits), + sym, + axis=1, + ) + q_matmul_node, new_inits = quant_utils.make_matmul_weight_only_node( node=node, weight_shape=org_shape, num_bits=num_bits, @@ -351,7 +347,7 @@ def gptq_quantize( 
k_blocks=k_blocks, q_weight=q_weight.astype("uint8"), scale=scale.astype(dtype), - zero_point=zp if scheme == "asym" else None, + zero_point=zp if not sym else None, accuracy_level=accuracy_level, ) @@ -361,7 +357,7 @@ def gptq_quantize( else: q_weight_tensor = onnx.helper.make_tensor( name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)), - data_type=utility.dtype_mapping[str(dtype)], + data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype), dims=q_weight.shape, vals=q_weight.astype(dtype).tobytes(), raw=True, @@ -391,6 +387,13 @@ def apply_gptq_on_model( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], quant_config: dict, calibration_data_reader: data_reader.CalibrationDataReader, + percdamp: float = 0.01, + block_size: int = 128, + actorder: bool = False, + mse: bool = False, + perchannel: bool = True, + providers: List[str] = ["CPUExecutionProvider"], + layer_wise_quant: bool = False, ) -> onnx.ModelProto: """Apply GPTQ on onnx model. @@ -402,18 +405,17 @@ def apply_gptq_on_model( Returns: onnx.ModelProto: quantized onnx model. """ - # check whether to do layer_wise quant - layer_wise = quant_config.pop("layer_wise_quant", False) - # set other model params - quant_kwargs = {} - quant_kwargs = {key: quant_config.pop(key) for key in config.GPTQConfig.model_params_list if key in quant_config} - - # change op config to dict type - for op_name_type, op_config in quant_config.items(): - if isinstance(op_config, config.GPTQConfig): - quant_config[op_name_type] = op_config.to_dict() - if layer_wise: + quant_kwargs = { + "percdamp": percdamp, + "block_size": block_size, + "actorder": actorder, + "mse": mse, + "perchannel": perchannel, + "providers": providers, + } + + if layer_wise_quant: quantized_model = core.layer_wise_quant( model, quant_func=gptq_quantize, @@ -428,4 +430,5 @@ def apply_gptq_on_model( if isinstance(quantized_model, onnx_model.ONNXModel): quantized_model = quantized_model.model + quant_utils.dump_woq_stats(quantized_model, quant_config) return quantized_model diff --git a/onnx_neural_compressor/algorithms/weight_only/rtn.py b/onnx_neural_compressor/algorithms/weight_only/rtn.py index 619c055e1..18fdc1e47 100644 --- a/onnx_neural_compressor/algorithms/weight_only/rtn.py +++ b/onnx_neural_compressor/algorithms/weight_only/rtn.py @@ -1,10 +1,7 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# # Copyright (c) 2023 MIT HAN Lab # This source code is licensed under the MIT license # -# Copyright (c) 2023 Intel Corporation +# Copyright (c) 2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
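Since apply_gptq_on_model (updated above) now takes its model-level options as explicit keyword arguments instead of popping them out of quant_config, and weight_config is keyed by node name rather than (name, op_type) tuples, a call could look like the sketch below. This is a hedged usage sketch, not documented API usage: the model path, node name, input name, and RandomDataReader are placeholders, and the reader's rewind method is assumed from the usual CalibrationDataReader interface (only get_next is confirmed by the code in this patch).

import numpy as np

from onnx_neural_compressor import data_reader
from onnx_neural_compressor.algorithms.weight_only import gptq


class RandomDataReader(data_reader.CalibrationDataReader):
    """Placeholder calibration reader yielding one random batch."""

    def __init__(self):
        self._data = iter([{"input_ids": np.random.randint(0, 100, (1, 32), dtype=np.int64)}])

    def get_next(self):
        return next(self._data, None)

    def rewind(self):  # assumed part of the reader interface
        pass


# Per-node options, keyed by node name (no longer by (name, op_type) tuples).
weight_config = {
    "/decoder/layers.0/fc1/MatMul": {      # hypothetical node name
        "weight_dtype": "int",
        "weight_bits": 4,
        "weight_group_size": 32,
        "weight_sym": True,
        "accuracy_level": 0,
    }
}

q_model = gptq.apply_gptq_on_model(
    "model.onnx",                          # hypothetical model path
    quant_config=weight_config,
    calibration_data_reader=RandomDataReader(),
    percdamp=0.01,
    block_size=128,
    actorder=False,
    mse=False,
    perchannel=True,
    providers=["CPUExecutionProvider"],
    layer_wise_quant=False,
)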
@@ -26,9 +23,9 @@ import onnxruntime as ort from packaging import version -from onnx_neural_compressor import config, constants, onnx_model, utility +from onnx_neural_compressor import constants, onnx_model, utility +from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor.algorithms.layer_wise import core -from onnx_neural_compressor.algorithms.weight_only import utility as woq_utility from typing import List, Union # isort: skip @@ -36,11 +33,7 @@ def rtn_quantize( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], weight_config: dict = {}, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", ratios: dict = {}, - accuracy_level: int = 0, providers: List[str] = ["CPUExecutionProvider"], return_modelproto: bool = True, ): @@ -60,14 +53,7 @@ def rtn_quantize( 'accuracy_level': 0 } }. Defaults to {}. - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): size of weight groups. Defaults to 32. - scheme (str, optional): indicates whether weights are symmetric. Defaults to "asym". ratios (dict, optional): percentile of clip. Defaults to {}. - accuracy_level (int, optional): - accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel). Defaults to 0. providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. return_modelproto (bool, optionmal): whether to return onnx.Modelproto. set False for layer-wise quant. Default to True @@ -92,7 +78,7 @@ def rtn_quantize( if ( node.op_type in ["MatMul"] # check op_type of node is MatMul and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" + and weight_config.get(node.name, {}).get("weight_dtype", "fp32") != "fp32" ): weight_tensor = model.get_initializer(node.input[1]) weight = onnx.numpy_helper.to_array(weight_tensor, base_dir=base_dir).copy() @@ -100,11 +86,10 @@ def rtn_quantize( continue dtype = weight.dtype - if (node.name, node.op_type) in weight_config: - num_bits = weight_config[(node.name, node.op_type)].get("weight_bits", 4) - group_size = weight_config[(node.name, node.op_type)].get("weight_group_size", 32) - scheme = "sym" if weight_config[(node.name, node.op_type)].get("weight_sym", True) else "asym" - accuracy_level = weight_config[(node.name, node.op_type)].get("accuracy_level", 0) + num_bits = weight_config[node.name].get("weight_bits", 4) + group_size = weight_config[node.name].get("weight_group_size", 32) + sym = weight_config[node.name].get("weight_sym", True) + accuracy_level = weight_config[node.name].get("accuracy_level", 0) org_w_shape = weight.shape # ic, oc group_size = group_size if group_size != -1 else org_w_shape[0] @@ -112,7 +97,7 @@ def rtn_quantize( k_blocks = (org_w_shape[0] - 1) // group_size + 1 init_share_num = model.get_initializer_share_num(node.input[1]) - weight = woq_utility.pad_tensor(weight, group_size, k_blocks) + weight = quant_utils.pad_tensor(weight, group_size, k_blocks) satisfy_MatMulNBits_condition = ( version.Version(ort.__version__) > constants.ONNXRT1161_VERSION and num_bits == 4 @@ -126,10 +111,14 @@ def rtn_quantize( ): # pragma: no cover # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions, supported by CPU EP # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1, supported by 
CPU EP AND CUDA EP - q_weight, scale, zp = woq_utility.quant_tensor( - weight.T, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1) + _, _, zp, scale, q_weight = quant_utils.quantize_data( + weight.T.reshape((-1, group_size)), + "uint" + str(num_bits), + sym, + ratio=ratios.get(node.input[1], 1), + axis=1, ) - q_matmul_node, new_inits = woq_utility.make_matmul_weight_only_node( + q_matmul_node, new_inits = quant_utils.make_matmul_weight_only_node( node=node, weight_shape=org_w_shape, num_bits=num_bits, @@ -137,7 +126,7 @@ def rtn_quantize( k_blocks=k_blocks, q_weight=q_weight.astype("uint8"), scale=scale.astype(dtype), - zero_point=zp if scheme == "asym" else None, + zero_point=zp if not sym else None, accuracy_level=accuracy_level, ) @@ -145,15 +134,19 @@ def rtn_quantize( remove_nodes.append(node) new_nodes.append(q_matmul_node) else: - q_weight = woq_utility.qdq_tensor( - weight.T, num_bits, group_size, scheme, "int", ratios.get(node.input[1], 1) + q_weight = quant_utils.qdq_data( + weight.T.reshape((-1, group_size)), + "int" + str(num_bits), + sym, + ratio=ratios.get(node.input[1], 1), + axis=1, ) q_weight = np.reshape(q_weight, (org_w_shape[1], -1)) q_weight = np.transpose(q_weight) q_weight = q_weight[: org_w_shape[0], :].astype(dtype) q_weight_tensor = onnx.helper.make_tensor( name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)), - data_type=utility.dtype_mapping[str(dtype)], + data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype), dims=weight.shape, vals=q_weight.tobytes(), raw=True, @@ -178,7 +171,11 @@ def rtn_quantize( def apply_rtn_on_model( - model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], quant_config: dict + model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], + quant_config: dict, + ratios: dict = {}, + providers: List[str] = ["CPUExecutionProvider"], + layer_wise_quant: bool = False, ) -> onnx.ModelProto: """Apply RTN on onnx model. @@ -189,19 +186,12 @@ def apply_rtn_on_model( Returns: onnx.ModelProto: quantized onnx model. """ - # check whether to do layer_wise quant - layer_wise = quant_config.pop("layer_wise_quant", False) - - # set other model params - quant_kwargs = {} - quant_kwargs = {key: quant_config.pop(key) for key in config.RTNConfig.model_params_list if key in quant_config} - - # change op config to dict type - for op_name_type, op_config in quant_config.items(): - if isinstance(op_config, config.RTNConfig): - quant_config[op_name_type] = op_config.to_dict() + quant_kwargs = { + "ratios": ratios, + "providers": providers, + } - if layer_wise: + if layer_wise_quant: quantized_model = core.layer_wise_quant( model, quant_func=rtn_quantize, weight_config=quant_config, **quant_kwargs ) @@ -210,4 +200,5 @@ def apply_rtn_on_model( if isinstance(quantized_model, onnx_model.ONNXModel): quantized_model = quantized_model.model + quant_utils.dump_woq_stats(quantized_model, quant_config) return quantized_model diff --git a/onnx_neural_compressor/algorithms/weight_only/utility.py b/onnx_neural_compressor/algorithms/weight_only/utility.py deleted file mode 100644 index ddb5f990d..000000000 --- a/onnx_neural_compressor/algorithms/weight_only/utility.py +++ /dev/null @@ -1,332 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 MIT HAN Lab -# This source code is licensed under the MIT license -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import struct -import sys -from importlib import util - -import numpy as np -import onnx -import onnxruntime as ort -from packaging import version - -from onnx_neural_compressor import constants, utility - -if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): # pragma: no cover - import onnxruntime_extensions - - -def _get_blob_size(group_size, has_zp): # pragma: no cover - """Get blob_size. - - Args: - group_size (int): how many elements share one scale/zp - has_zp (bool): whether zero_point is None - """ - if version.Version(ort.__version__) > constants.ONNXRT1161_VERSION: - blob_size = group_size // 2 - elif has_zp: - blob_size = group_size // 2 + 4 + 1 - else: - blob_size = group_size // 2 + 4 - return blob_size - - -def make_matmul_weight_only_node( - node: onnx.NodeProto, - weight_shape: tuple, - num_bits: int, - group_size: int, - k_blocks: int, - q_weight: np.array, - scale: np.array, - zero_point: np.array, - accuracy_level: int = 0, -): - """Build MatMulFpQ4/MatMulNBits node. - - Args: - node (onnx.NodeProto): original matmul node - weight_shape (tuple): original weight shape - num_bits (int): number of bits used to represent weights. - group_size (int): how many elements share one scale/zp - k_blocks (int): block number - q_weight (np.array): quantized weight - scale (np.array): scale - zero_point (np.array): zero point - accuracy_level (int, optional): accuracy level. - Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel) Defaults to 0. 
- - Returns: - matmul_weight_only_node: MatMulFpQ4 or MatMulNBits node - new_inits: initializers of the new node - """ - blob_size = _get_blob_size(group_size, zero_point is not None) - packed = np.zeros((q_weight.shape[0], blob_size), dtype="uint8") - q_weight_name = node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)) - input_names = [node.input[0], q_weight_name] - new_inits = [] - kwargs = {} - - if version.Version(ort.__version__) > constants.ONNXRT1161_VERSION: - op_type = "MatMulNBits" - - # pack quantized weight - q_weight_pairs = q_weight[:, ::2] | q_weight[:, 1::2] << 4 - packed[:, :] = q_weight_pairs[:, :blob_size] - packed = np.reshape(packed, (-1, k_blocks, blob_size)) - - # build scale tensor - scale = np.reshape(scale, (-1, k_blocks)) - scale_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_scale", - data_type=utility.dtype_mapping[str(scale.dtype)], - dims=scale.shape, - vals=scale.tobytes(), - raw=True, - ) - input_names.append(scale_tensor.name) - new_inits.append(scale_tensor) - - # build zero_point tensor - if zero_point is not None: - if num_bits > 4: - packed_zp = np.reshape(zero_point, (1, -1)).astype("uint8") - else: - packed_zp = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8") - # create an index array - idx = np.arange(zero_point.shape[0] // k_blocks * k_blocks).reshape(-1) - # separate odd and even indices - even_idx = idx[::2] - odd_idx = idx[1::2] - # vectorized operation for even and odd indices - packed_zp[even_idx // 2] = (packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel() - packed_zp[odd_idx // 2] = (packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4) - - zp_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True - ) - input_names.append(zp_tensor.name) - new_inits.append(zp_tensor) - - # set kwargs - kwargs["K"] = weight_shape[0] - kwargs["N"] = weight_shape[1] - kwargs["bits"] = num_bits - kwargs["block_size"] = group_size - if accuracy_level > 0: - # require onnxruntime > 1.16.3 - kwargs["accuracy_level"] = accuracy_level - - else: - offset = 5 if zero_point is not None else 4 - op_type = "MatMulFpQ4" - - # pack quantized weight - for i in range(q_weight.shape[0]): - bf = struct.pack("f", scale[i]) - packed[i][0] = bf[0] - packed[i][1] = bf[1] - packed[i][2] = bf[2] - packed[i][3] = bf[3] - - if zero_point is not None: - packed[i][4] = zero_point[i] - - packed[i][offset:] = np.bitwise_or( - q_weight[i][: group_size // 2], np.left_shift(q_weight[i][group_size // 2 :], num_bits) - ) - packed = packed.reshape(-1) - - # build shape tensor - shape_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_shape", data_type=7, dims=(2,), vals=np.array(weight_shape, dtype="int64") - ) - new_inits.append(shape_tensor) - input_names.append(shape_tensor.name) - - # set kwargs - kwargs["blk_quant_type"] = 1 if zero_point is not None else 0 - - q_weight_tensor = onnx.helper.make_tensor( - name=q_weight_name, - data_type=2, - dims=packed.shape, - vals=packed.tobytes(), - raw=True, - ) - new_inits.append(q_weight_tensor) - - matmul_weight_only_node = onnx.helper.make_node( - op_type, - inputs=input_names, - outputs=node.output, - name=node.name + "_Q" + str(num_bits) if node.name else "_Q" + str(num_bits), - domain="com.microsoft", - **kwargs, - ) - return matmul_weight_only_node, new_inits - - -def prepare_inputs(model, data_reader, providers): - """Prepare inputs for weight only quantization. 
- - Args: - model (ModelProto or onnx_model.ONNXModel): onnx model. - data_reader (CalibrationDataReader): a calibration data reader. - providers (list): providers to use. - - Returns: - inputs: prepared inputs. - so: session options - """ - - so = ort.SessionOptions() - if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): # pragma: no cover - so.register_custom_ops_library(onnxruntime_extensions.get_library_path()) - if model.is_large_model: - onnx.save_model( - model.model, - model.model_path + "_augment.onnx", - save_as_external_data=True, - all_tensors_to_one_file=True, - convert_attribute=False, - ) - - inputs_list = [] - while True: - inputs = data_reader.get_next() - if not inputs: - break - inputs_list.append(inputs) - return inputs_list, so - - -def pad_tensor(weight, group_size, k_blocks): - """Pad tensor rowi so that it can be is divisible by group_size. - - Args: - weight (array): weight - group_size (int): how many elements share one scale/zp - k_blocks (int): the number of block - - Returns: - weight: paded weight - """ - if group_size == -1: - return weight - - org_w_shape = weight.shape - padded_rows = k_blocks * group_size - pad_len = padded_rows - org_w_shape[0] - - if pad_len > 0: - weight = np.pad(weight, ((0, pad_len), (0, 0)), "constant") - - return weight - - -def quant_tensor( - data: np.array, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", - dtype: str = "int", - ratio: float = 1.0, -): - """Quantize tensor per group. - - Args: - data (np.array): input weight - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): how many elements share one scale/zp. Defaults to 4. - scheme (str, optional): _quantization scheme. Defaults to "asym". - dtype (str, optional): data type. Defaults to "int". - ratio (float, optional): percentile of clip. Defaults to 1.0. - - Returns: - output: quantized weight - scale: scale - zero_point: zero point - """ - data = np.reshape(data, (-1, group_size)) - if scheme == "asym" or dtype == "uint": - maxq = 2**num_bits - 1 - minq = 0 - elif scheme == "sym": - maxq = 2 ** (num_bits - 1) - 1 if num_bits != 1 else 0 - minq = -(2 ** (num_bits - 1)) if num_bits != 1 else -1 - - rmin = np.min(data, axis=1, keepdims=True) * ratio - rmax = np.max(data, axis=1, keepdims=True) * ratio - if scheme == "sym": - max_range = np.maximum(np.abs(rmin), np.abs(rmax)) - - scale = np.ones(rmax.shape) - mask = max_range > 0 - scale[mask] = (max_range[mask] * 2.0).astype(np.float64) / (maxq - minq) - zero_point = ( - np.zeros(scale.shape) if dtype == "int" else np.ones(rmax.shape, dtype="uint8") * (1 << (num_bits - 1)) - ) - else: - scale = np.ones(rmax.shape) - scale[rmin != rmax] = np.array( - [float(i) / (maxq - minq) for i in (rmax - rmin)[rmin != rmax].flatten().tolist()] - ) - zero_point = ( - ((np.zeros(scale.shape) - rmin) / scale).round() - if dtype == "int" - else np.maximum(0, np.minimum(maxq, ((np.zeros(scale.shape) - rmin) / scale).round())).astype("uint8") - ) - q_weight = np.empty_like(data, dtype=scale.dtype) - np.divide(data, scale, out=q_weight) - np.add(q_weight, zero_point, out=q_weight) - np.round(q_weight, out=q_weight) - np.clip(q_weight, minq, maxq, out=q_weight) - - return q_weight, scale, zero_point - - -def qdq_tensor( - data: np.array, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", - dtype: str = "int", - ratio: float = 1.0, -): - """Quant dequant tensor per group. 
- - Args: - data (np.array): input weight - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): how many elements share one scale/zp. Defaults to 32. - scheme (str, optional): quantization scheme. Defaults to "asym". - dtype (str, optional): data type. Defaults to "int". - ratio (float, optional): percentile of clip. Defaults to 1.0. - - Returns: - output: quant-dequant weight - """ - org_shape = data.shape - weight, scale, zp = quant_tensor(data, num_bits, group_size, scheme, dtype, ratio) - return np.reshape(scale * (weight - zp), org_shape) diff --git a/onnx_neural_compressor/config.py b/onnx_neural_compressor/config.py deleted file mode 100644 index b6fad923a..000000000 --- a/onnx_neural_compressor/config.py +++ /dev/null @@ -1,1239 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import enum -import inspect -import itertools -import json -import pathlib -import re -from abc import ABC, abstractmethod - -import numpy as np -import onnx -import pydantic -from onnxruntime import quantization -from typing_extensions import Self - -from onnx_neural_compressor import constants, data_reader, logger, utility - -from collections import OrderedDict # isort: skip -from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Type, Union, _GenericAlias # isort: skip - - -class ParamLevel(enum.Enum): - OP_LEVEL = enum.auto() - OP_TYPE_LEVEL = enum.auto() - MODEL_LEVEL = enum.auto() - - -class TuningParam: - """Define the tunable parameter for the algorithm. - - Example: - Class FakeAlgoConfig(config.BaseConfig): - '''Fake algo config.'''. - - params_list = [ - ... - # For simple tunable types, like a list of int, giving - # the param name is enough. `config.BaseConfig` class will - # create the `TuningParam` implicitly. - "simple_attr" - - # For complex tunable types, like a list of lists, - # developers need to create the `TuningParam` explicitly. - TuningParam("complex_attr", tunable_type=List[List[str]]) - - # The default parameter level is `ParamLevel.OP_LEVEL`. - # If the parameter is at a different level, developers need - # to specify it explicitly. - TuningParam("model_attr", level=ParamLevel.MODEL_LEVEL) - - ... - - # TODO: more examples to explain the usage of `TuningParam`. - """ - - def __init__( - self, - name: str, - default_val: Any = None, - tunable_type=None, - options=None, - level: ParamLevel = ParamLevel.OP_LEVEL, - ) -> None: - self.name = name - self.default_val = default_val - self.tunable_type = tunable_type - self.options = options - self.level = level - - @staticmethod - def create_input_args_model(expect_args_type: Any) -> type: - """Dynamically create an InputArgsModel based on the provided type hint. - - Parameters: - - expect_args_type (Any): The user-provided type hint for input_args. - - Returns: - - type: The dynamically created InputArgsModel class. 
- """ - - class DynamicInputArgsModel(pydantic.BaseModel): - input_args: expect_args_type - - return DynamicInputArgsModel - - def is_tunable(self, value: Any) -> bool: - # Use `Pydantic` to validate the input_args. - # TODO: refine the implementation in further. - assert isinstance(self.tunable_type, _GenericAlias), f"Expected a type hint, got {self.tunable_type} instead." - DynamicInputArgsModel = TuningParam.create_input_args_model(self.tunable_type) - try: - new_args = DynamicInputArgsModel(input_args=value) - return True - except Exception as e: - logger.debug(f"Failed to validate the input_args: {e}") - return False - - -# Config registry to store all registered configs. -class ConfigRegistry(object): - registered_configs = {} - _config_registry = None - - def __new__(cls) -> Self: - if cls._config_registry is None: - cls._config_registry = super(ConfigRegistry, cls).__new__(cls) - - return cls._config_registry - - @classmethod - def register_config_impl(cls, algo_name: str, priority: Union[float, int] = 0): - """Register config decorator. - - The register the configuration classes for different algorithms. - - Usage example: - @ConfigRegistry.register_config(algo_name=ExampleAlgorithm, priority=100) - class ExampleAlgorithmConfig: - # Configuration details for the ExampleAlgorithm - - Args: - algo_name: the algorithm name. - priority: priority: the priority of the configuration. A larger number indicates a higher priority, - which will be tried first at the auto-tune stage. Defaults to 0. - """ - - def decorator(config_cls): - cls.registered_configs[algo_name] = {"priority": priority, "cls": config_cls} - return config_cls - - return decorator - - @classmethod - def get_all_configs(cls) -> Dict[str, Dict[str, Dict[str, object]]]: - """Get all registered configurations.""" - return cls.registered_configs - - @classmethod - def get_sorted_configs(cls) -> Dict[str, OrderedDict[str, Dict[str, object]]]: - """Get registered configurations sorted by priority.""" - return OrderedDict(sorted(cls.registered_configs.items(), key=lambda x: x[1]["priority"], reverse=True)) - - @classmethod - def get_cls_configs(cls) -> Dict[str, Dict[str, object]]: - """Get registered configurations without priority.""" - cls_configs = {} - for algo_name, config_data in cls.registered_configs.items(): - cls_configs[algo_name] = config_data["cls"] - return cls_configs - - @classmethod - def get_all_config_cls(cls) -> List[Type[BaseConfig]]: - configs_cls = [] - for algo_name, config_pairs in cls.registered_configs.items(): - configs_cls.append(config_pairs["cls"]) - return configs_cls - - -config_registry = ConfigRegistry() - - -def register_config(algo_name: str, priority: Union[float, int] = 0): - """Register config decorator. - - The register the configuration classes for different algorithms. - - Usage example: - @register_config(algo_name=ExampleAlgorithm, priority=100) - class ExampleAlgorithmConfig: - # Configuration details for the ExampleAlgorithm - - Args: - algo_name: the algorithm name. - priority: the priority of the configuration. A larger number indicates a higher priority, - which will be tried first at the auto-tune stage. Defaults to 0. 
- """ - - return config_registry.register_config_impl(algo_name=algo_name, priority=priority) - - -class BaseConfig(ABC): - """The base config for all algorithm configs.""" - - name = constants.BASE_CONFIG - params_list: List[Union[str, TuningParam]] = [] - - def __init__( - self, - white_list: Optional[Union[Union[str, Callable], List[Union[str, Callable]]]] = constants.DEFAULT_WHITE_LIST, - ) -> None: - self._global_config: Optional[BaseConfig] = None - # For PyTorch, operator_type is the collective name for module type and functional operation type, - # for example, `torch.nn.Linear`, and `torch.nn.functional.linear`. - # local config is the collections of operator_type configs and operator configs - self._local_config: Dict[str, Optional[BaseConfig]] = {} - self._white_list = white_list - - def _post_init(self): - if self.white_list == constants.DEFAULT_WHITE_LIST: - global_config = self.get_params_dict() - self._global_config = self.__class__(**global_config, white_list=None) - elif isinstance(self.white_list, list) and len(self.white_list) > 0: - for op_name_or_type in self.white_list: - global_config = self.get_params_dict() - tmp_config = self.__class__(**global_config, white_list=None) - self.set_local(op_name_or_type, tmp_config) - elif self.white_list == constants.EMPTY_WHITE_LIST: - return - else: - raise NotImplementedError( - f"The white list should be one of {constants.DEFAULT_WHITE_LIST}, {constants.EMPTY_WHITE_LIST}," - " a not empty list, but got {self.white_list}" - ) - - @property - def white_list(self): - return self._white_list - - @white_list.setter - def white_list(self, op_name_or_type_list: Optional[List[Union[str, Callable]]]): - self._white_list = op_name_or_type_list - - @property - def global_config(self): - return self._global_config - - @global_config.setter - def global_config(self, config): - self._global_config = config - - @property - def local_config(self): - return self._local_config - - @local_config.setter - def local_config(self, config): - self._local_config = config - - def set_local(self, operator_name: str, config: BaseConfig) -> BaseConfig: - if operator_name in self.local_config: - logger.warning("The configuration for %s has already been set, update it.", operator_name) - self.local_config[operator_name] = config - return self - - def to_dict(self): - result = {} - global_config = self.get_params_dict() - if bool(self.local_config): - result[constants.LOCAL] = {} - for op_name, config in self.local_config.items(): - result[constants.LOCAL][op_name] = config.to_dict() - if self.global_config: - result[constants.GLOBAL] = global_config - else: - result = global_config - return result - - def get_params_dict(self): - result = dict() - for param, value in self.__dict__.items(): - if param not in ["_global_config", "_local_config", "_white_list"]: - result[param] = value - return result - - @classmethod - def from_dict(cls, config_dict): - """Construct config from a dict. - - Args: - config_dict: _description_ - - Returns: - The constructed config. 
- """ - if constants.GLOBAL not in config_dict and constants.LOCAL not in config_dict: - config = cls(**config_dict) - return config - else: - config = cls(**config_dict.get(constants.GLOBAL, {})) - operator_config = config_dict.get(constants.LOCAL, {}) - if operator_config: - for op_name, op_config in operator_config.items(): - config.set_local(op_name, cls(**op_config)) - return config - - @classmethod - def to_diff_dict(cls, instance) -> Dict[str, Any]: - # TODO (Yi) to implement it - return {} - - @classmethod - def from_json_file(cls, filename): - with open(filename, "r", encoding="utf-8") as file: - config_dict = json.load(file) - return cls.from_dict(**config_dict) - - def to_json_file(self, filename): - config_dict = self.to_dict() - with open(filename, "w", encoding="utf-8") as file: - json.dump(config_dict, file, indent=4) - logger.info("Dump the config into %s.", filename) - - def to_json_string(self, use_diff: bool = False) -> Union[str, Dict]: - """Serializes this instance to a JSON string. - - Args: - use_diff (`bool`, *optional*, defaults to `True`): - If set to `True`, only the difference between the config instance and the default `BaseConfig()` - is serialized to JSON string. - - Returns: - `str`: String containing all the attributes that make up this configuration instance in JSON format. - """ - if use_diff is True: - config_dict = self.to_diff_dict(self) - else: - config_dict = self.to_dict() - try: - return json.dumps(config_dict, indent=2) + "\n" - except Exception as e: - logger.error("Failed to serialize the config to JSON string: %s", e) - return config_dict - - def __repr__(self) -> str: - return f"{self.__class__.__name__} {self.to_json_string()}" - - @classmethod - @abstractmethod - def register_supported_configs(cls): - """Add all supported configs.""" - raise NotImplementedError - - @classmethod - def validate(self, user_config: BaseConfig): - # TODO validate the user config - pass - - def __add__(self, other: BaseConfig) -> BaseConfig: - if isinstance(other, type(self)): - for op_name, config in other.local_config.items(): - self.set_local(op_name, config) - return self - else: - return ComposableConfig(configs=[self, other]) - - @staticmethod - def get_the_default_value_of_param(config: BaseConfig, param: str) -> Any: - # Get the signature of the __init__ method - signature = inspect.signature(config.__init__) - - # Get the parameters and their default values - parameters = signature.parameters - return parameters.get(param).default - - def expand(self) -> List[BaseConfig]: - """Expand the config. - - case 1 - { - "global": { "weight_bits": [4, 6]} - } - expand to : - 1st trial config: - { - "global": { "weight_bits": 4} - } - 2nd trial config: - { - "global": { "weight_bits": 6} - } - case 2 - # TODO to support the expansion of config with `local` - { - "global": { - "weight_bits": [4, 6] - }, - "local": - { - "fc1":{ - "weight_bits": [6, 8] - }, - "fc2":{ - "weight_bits": [4] - } - } - - } -> ? - """ - config_list: List[BaseConfig] = [] - params_list = self.params_list - config = self - tuning_param_list = [] - not_tuning_param_pair = {} # key is the param name, value is the user specified value - for param in params_list: - # Create `tuning.TuningParam` for each param - # There are two cases: - # 1. The param is a string. - # 2. The param is a `tuning.TuningParam` instance. 
- if isinstance(param, str): - default_param = self.get_the_default_value_of_param(config, param) - tuning_param = TuningParam(name=param, tunable_type=List[type(default_param)]) - elif isinstance(param, TuningParam): - tuning_param = param - else: - raise ValueError(f"Unsupported param type: {param}") - # Assign the options to the `tuning.TuningParam` instance - param_val = getattr(config, tuning_param.name) - if param_val is not None: - if tuning_param.is_tunable(param_val): - tuning_param.options = param_val - tuning_param_list.append(tuning_param) - else: - not_tuning_param_pair[tuning_param.name] = param_val - logger.debug("Tuning param list: %s", tuning_param_list) - logger.debug("Not tuning param pair: %s", not_tuning_param_pair) - if len(tuning_param_list) == 0: - config_list = [config] - else: - tuning_param_name_lst = [tuning_param.name for tuning_param in tuning_param_list] - for params_values in itertools.product(*[tuning_param.options for tuning_param in tuning_param_list]): - tuning_param_pair = dict(zip(tuning_param_name_lst, params_values)) - tmp_params_dict = {**not_tuning_param_pair, **tuning_param_pair} - new_config = self.__class__(**tmp_params_dict) - logger.info(new_config.to_dict()) - config_list.append(new_config) - logger.info("Expanded the %s and got %d configs.", self.__class__.name, len(config_list)) - return config_list - - def _get_op_name_op_type_config(self): - op_type_config_dict = dict() - op_name_config_dict = dict() - for name, config in self.local_config.items(): - if self._is_op_type(name): - op_type_config_dict[name] = config - else: - op_name_config_dict[name] = config - return op_type_config_dict, op_name_config_dict - - def to_config_mapping( - self, config_list: Optional[List[BaseConfig]] = None, model_info: List[Tuple[str, str]] = None - ) -> OrderedDict[Tuple[str, str], OrderedDict[str, BaseConfig]]: - config_mapping = OrderedDict() - if config_list is None: - config_list = [self] - for config in config_list: - global_config = config.global_config - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - for op_name, op_type in model_info: - if self.global_config is not None: - config_mapping[(op_name, op_type)] = global_config - if op_type in op_type_config_dict: - config_mapping[(op_name, op_type)] = op_name_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if isinstance(op_name, str) and re.match(op_name_pattern, op_name): - config_mapping[(op_name, op_type)] = op_name_config_dict[op_name_pattern] - elif op_name_pattern == op_name: - config_mapping[(op_name, op_type)] = op_name_config_dict[op_name_pattern] - return config_mapping - - @staticmethod - def _is_op_type(name: str) -> bool: - # * Ort and TF may override this method. 
- return not isinstance(name, str) - - @classmethod - @abstractmethod - def get_config_set_for_tuning(cls): - raise NotImplementedError - - -class ComposableConfig(BaseConfig): - name = constants.COMPOSABLE_CONFIG - - def __init__(self, configs: List[BaseConfig]) -> None: - self.config_list = configs - - def __add__(self, other: BaseConfig) -> BaseConfig: - if isinstance(other, type(self)): - self.config_list.extend(other.config_list) - else: - self.config_list.append(other) - return self - - def to_dict(self): - result = {} - for config in self.config_list: - result[config.name] = config.to_dict() - return result - - @classmethod - def from_dict(cls, config_dict: OrderedDict[str, Dict], config_registry: Dict[str, BaseConfig]): - assert len(config_dict) >= 1, "The config dict must include at least one configuration." - num_configs = len(config_dict) - name, value = next(iter(config_dict.items())) - config = config_registry[name].from_dict(value) - for _ in range(num_configs - 1): - name, value = next(iter(config_dict.items())) - config += config_registry[name].from_dict(value) - return config - - def to_json_string(self, use_diff: bool = False) -> str: - return json.dumps(self.to_dict(), indent=2) + "\n" - - def __repr__(self) -> str: - return f"{self.__class__.__name__} {self.to_json_string()}" - - def to_config_mapping( - self, config_list: List[BaseConfig] = None, model_info: Dict[str, Any] = None - ) -> OrderedDict[str, BaseConfig]: - config_mapping = OrderedDict() - for config in self.config_list: - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - single_config_model_info = model_info.get(config.name, None) - for op_name, op_type in single_config_model_info: - if op_type in op_type_config_dict: - config_mapping[(op_name, op_type)] = op_name_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if re.match(op_name_pattern, op_name): - config_mapping[(op_name, op_type)] = op_name_config_dict[op_name_pattern] - return config_mapping - - @classmethod - def register_supported_configs(cls): - """Add all supported configs.""" - raise NotImplementedError - - @classmethod - def get_config_set_for_tuning(cls) -> None: - # TODO (Yi) handle the composable config in `tuning_config` - return None - - def get_model_info(self, model, *args, **kwargs): - model_info_dict = dict() - for config in self.config_list: - model_info_dict.update({config.name: config.get_model_info(model, *args, **kwargs)}) - return model_info_dict - - -def get_all_config_set_from_config_registry() -> List[BaseConfig]: - all_registered_config_cls: List[Type[BaseConfig]] = config_registry.get_all_config_cls() - config_set = [] - for config_cls in all_registered_config_cls: - config_set.append(config_cls.get_config_set_for_tuning()) - return config_set - - -def register_supported_configs(): - """Register supported configs.""" - all_registered_config_cls: List[Type[BaseConfig]] = config_registry.get_all_config_cls() - for config_cls in all_registered_config_cls: - config_cls.register_supported_configs() - - -class _OperatorConfig(NamedTuple): - config: BaseConfig - operators: List[Union[str, Callable]] - valid_func_list: List[Callable] = [] - - -######################## RNT Config ############################### - - -@register_config(algo_name=constants.RTN, priority=constants.PRIORITY_RTN) -class RTNConfig(BaseConfig): - """Config class for round-to-nearest weight-only quantization.""" - - supported_configs: List[_OperatorConfig] = [] - params_list: List[Union[str, TuningParam]] = [ 
- "weight_dtype", - "weight_bits", - "weight_group_size", - "weight_sym", - "act_dtype", - "accuracy_level", - "ratios", - ] - model_params_list: List[str] = [ - "providers", - "layer_wise_quant", - ] - name: str = constants.RTN - - def __init__( - self, - weight_dtype: str = "int", - weight_bits: int = 4, - weight_group_size: int = 32, - weight_sym: bool = True, - act_dtype: str = "fp32", - accuracy_level: int = 0, - ratios: dict = {}, - providers: List[str] = ["CPUExecutionProvider"], - layer_wise_quant: bool = False, - quant_last_matmul: bool = True, - white_list: List[Union[str, Callable]] = constants.DEFAULT_WHITE_LIST, - ): - """Init RTN weight-only quantization config. - - Args: - weight_dtype (str, optional): Data type for weights, default is "int". - weight_bits (int, optional): Number of bits used to represent weights, default is 4. - weight_group_size (int, optional): Size of weight groups, default is 32. - weight_sym (bool, optional): Indicates whether weights are symmetric, default is True. - act_dtype (str, optional): Data type for activations, default is "fp32". - accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel). Defaults to 0. - ratios (dict, optional): percentile of clip. Defaults to {}. - providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. - layer_wise_quant (bool, optional): whether to quantize model layer by layer to save memory footprint. - Check below link for details - https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_layer_wise.md, - default is False. - quant_last_matmul (bool, optional): whether to quantize the last matmul of the model, default is True. - white_list (list, optional): op in white_list will be applied current config. - Defaults to constants.DEFAULT_WHITE_LIST. 
- """ - super().__init__(white_list=white_list) - self.weight_bits = weight_bits - self.weight_dtype = weight_dtype - self.weight_group_size = weight_group_size - self.weight_sym = weight_sym - self.act_dtype = act_dtype - self.accuracy_level = accuracy_level - self.ratios = ratios - self.providers = providers - self.layer_wise_quant = layer_wise_quant - self.quant_last_matmul = quant_last_matmul - self._post_init() - - def get_model_params_dict(self): - result = dict() - for param in self.model_params_list: - result[param] = getattr(self, param) - return result - - @classmethod - def register_supported_configs(cls) -> None: - supported_configs = [] - linear_rtn_config = RTNConfig( - weight_dtype=["int"], - weight_bits=[1, 2, 3, 4, 5, 6, 7, 8], - weight_group_size=[32, -1, 1, 16, 64, 128, 256, 512, 1024], - weight_sym=[True, False], - act_dtype=["fp32"], - ) - operators = ["MatMul"] - supported_configs.append(_OperatorConfig(config=linear_rtn_config, operators=operators)) - cls.supported_configs = supported_configs - - def to_config_mapping(self, config_list: List[BaseConfig] = None, model_info: list = None): - config_mapping = OrderedDict() - if config_list is None: - config_list = [self] - for config in config_list: - # update model level setting - config_mapping.update(config.get_model_params_dict()) - - # update node level setting - global_config = config.global_config - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - for op_name, op_type in model_info: - if self.global_config is not None: - config_mapping[(op_name, op_type)] = global_config - if op_type in op_type_config_dict: - config_mapping[(op_name, op_type)] = op_name_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if re.match(op_name_pattern, op_name): - config_mapping[(op_name, op_type)] = op_name_config_dict[op_name_pattern] - if not self.quant_last_matmul: - config_mapping[model_info[-1]] = { - "weight": {"dtype": "fp32"}, - "activation": {"dtype": "fp32", "quant_mode": "fp32"}, - } - return config_mapping - - @staticmethod - def get_model_info(model: Union[onnx.ModelProto, pathlib.Path, str]) -> list: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model, load_external_data=False) - white_list = ["MatMul"] - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - @classmethod - def get_config_set_for_tuning(cls) -> Union[None, "RTNConfig", List["RTNConfig"]]: # pragma: no cover - return RTNConfig(weight_bits=[4, 8], weight_sym=[True, False]) - - -def get_default_rtn_config() -> RTNConfig: - """Generate the default rtn config. - - Returns: - the default rtn config. 
- """ - return RTNConfig() - - -######################## GPTQ Config ############################### - - -@register_config(algo_name=constants.GPTQ, priority=constants.PRIORITY_GPTQ) -class GPTQConfig(BaseConfig): - """Config class for gptq weight-only quantization.""" - - supported_configs: List[_OperatorConfig] = [] - params_list: List[Union[str, TuningParam]] = [ - "weight_dtype", - "weight_bits", - "weight_group_size", - "weight_sym", - "act_dtype", - "accuracy_level", - ] - model_params_list: List[Union[str, TuningParam]] = [ - "percdamp", - "blocksize", - "actorder", - "mse", - "perchannel", - "providers", - "layer_wise_quant", - ] - name: str = constants.GPTQ - - def __init__( - self, - weight_dtype: str = "int", - weight_bits: int = 4, - weight_group_size: int = 32, - weight_sym: bool = True, - act_dtype: str = "fp32", - accuracy_level: int = 0, - percdamp: float = 0.01, - blocksize: int = 128, - actorder: bool = False, - mse: bool = False, - perchannel: bool = True, - providers: List[str] = ["CPUExecutionProvider"], - layer_wise_quant: bool = False, - quant_last_matmul: bool = True, - white_list: List[Union[str, Callable]] = constants.DEFAULT_WHITE_LIST, - ): - """Init GPTQ weight-only quantization config. - - Args: - weight_dtype (str, optional): data type for weights. Defaults to "int". - weight_bits (int, optional): number of bits used to represent weights. Defaults to 4. - weight_group_size (int, optional): size of weight groups. Defaults to 32. - weight_sym (bool, optional): indicates whether weights are symmetric. Defaults to True. - act_dtype (str, optional): data type for activations. Defaults to "fp32". - accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel). Defaults to 0. - percdamp (float, optional): percentage of Hessian's diagonal values' average, which will be added - to Hessian's diagonal to increase numerical stability. Defaults to 0.01. - blocksize (int, optional): execute GPTQ quantization per block. Defaults to 128. - actorder (bool, optional): whether to sort Hessian's diagonal values to rearrange channel-wise - quantization order. Defaults to False. - mse (bool, optional): whether get scale and zero point with mse error. Defaults to False. - perchannel (bool, optional): whether quantize weight per-channel. Defaults to True. - providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. - layer_wise_quant (bool, optional): whether to quantize model layer by layer to save memory footprint. - Check below link for details - https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_layer_wise.md, - default is False. - quant_last_matmul (bool, optional): whether to quantize the last matmul of the model, default is True. - white_list (list, optional): op in white_list will be applied current config. - Defaults to constants.DEFAULT_WHITE_LIST. 
- """ - super().__init__(white_list=white_list) - self.weight_bits = weight_bits - self.weight_dtype = weight_dtype - self.weight_group_size = weight_group_size - self.weight_sym = weight_sym - self.act_dtype = act_dtype - self.accuracy_level = accuracy_level - self.percdamp = percdamp - self.blocksize = blocksize - self.actorder = actorder - self.mse = mse - self.perchannel = perchannel - self.providers = providers - self.layer_wise_quant = layer_wise_quant - self.quant_last_matmul = quant_last_matmul - self._post_init() - - def get_model_params_dict(self): - result = dict() - for param in self.model_params_list: - result[param] = getattr(self, param) - return result - - @classmethod - def register_supported_configs(cls) -> None: - supported_configs = [] - linear_gptq_config = GPTQConfig( - weight_dtype=["int"], - weight_bits=[1, 2, 3, 4, 5, 6, 7, 8], - weight_group_size=[32, -1, 1, 16, 64, 128, 256, 512, 1024], - weight_sym=[True, False], - act_dtype=["fp32"], - actorder=[True, False], - mse=[True, False], - perchannel=[True, False], - ) - operators = ["MatMul"] - supported_configs.append(_OperatorConfig(config=linear_gptq_config, operators=operators)) - cls.supported_configs = supported_configs - - def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: - config_mapping = OrderedDict() - if config_list is None: - config_list = [self] - for config in config_list: - # update model level setting - config_mapping.update(config.get_model_params_dict()) - - # update node level setting - global_config = config.global_config - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - for op_name, op_type in model_info: - if self.global_config is not None: - config_mapping[(op_name, op_type)] = global_config - if op_type in op_type_config_dict: - config_mapping[(op_name, op_type)] = op_name_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if re.match(op_name_pattern, op_name): - config_mapping[(op_name, op_type)] = op_name_config_dict[op_name_pattern] - if not self.quant_last_matmul: - config_mapping[model_info[-1]] = { - "weight": {"dtype": "fp32"}, - "activation": {"dtype": "fp32", "quant_mode": "fp32"}, - } - return config_mapping - - @staticmethod - def get_model_info(model: Union[onnx.ModelProto, pathlib.Path, str]) -> list: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model, load_external_data=False) - white_list = ["MatMul"] - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - @classmethod - def get_config_set_for_tuning(cls) -> Union[None, "GPTQConfig", List["GPTQConfig"]]: # pragma: no cover - return GPTQConfig( - weight_bits=[4, 8], - weight_sym=[True, False], - actorder=[True, False], - mse=[True, False], - perchannel=[True, False], - ) - - -def get_default_gptq_config() -> GPTQConfig: - """Generate the default gptq config. - - Returns: - the default gptq config. 
- """ - return GPTQConfig() - - -######################## AWQ Config ############################### - - -@register_config(algo_name=constants.AWQ, priority=constants.PRIORITY_AWQ) -class AWQConfig(BaseConfig): - """Config class for awq weight-only quantization.""" - - supported_configs: List[_OperatorConfig] = [] - params_list: List[str] = [ - "weight_dtype", - "weight_bits", - "weight_group_size", - "weight_sym", - "act_dtype", - "accuracy_level", - ] - model_params_list: List[str] = [ - "enable_auto_scale", - "enable_mse_search", - "providers", - ] - name: str = constants.AWQ - - def __init__( - self, - weight_dtype: str = "int", - weight_bits: int = 4, - weight_group_size: int = 32, - weight_sym: bool = True, - act_dtype: str = "fp32", - accuracy_level: int = 0, - enable_auto_scale: bool = True, - enable_mse_search: bool = True, - providers: List[str] = ["CPUExecutionProvider"], - quant_last_matmul: bool = True, - white_list: List[Union[str, Callable]] = constants.DEFAULT_WHITE_LIST, - ): - """Init AWQ weight-only quantization config. - - Args: - weight_dtype (str, optional): data type for weights. Defaults to "int". - weight_bits (int, optional): number of bits used to represent weights. Defaults to 4. - weight_group_size (int, optional): size of weight groups. Defaults to 32. - weight_sym (bool, optional): indicates whether weights are symmetric. Defaults to True. - act_dtype (str, optional): data type for activations. Defaults to "fp32". - accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel). Defaults to 0. - enable_auto_scale (bool, optional): whether to search for best scales based on activation distribution. - Defaults to True. - enable_mse_search (bool, optional): whether to search for the best clip range from range - [0.91, 1.0, 0.01]. Defaults to True. - providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. - quant_last_matmul (bool, optional): whether to quantize the last matmul of the model, default is True. - white_list (list, optional): op in white_list will be applied current config. - Defaults to constants.DEFAULT_WHITE_LIST. 
- """ - super().__init__(white_list=white_list) - self.weight_bits = weight_bits - self.weight_dtype = weight_dtype - self.weight_group_size = weight_group_size - self.weight_sym = weight_sym - self.act_dtype = act_dtype - self.accuracy_level = accuracy_level - self.enable_auto_scale = enable_auto_scale - self.enable_mse_search = enable_mse_search - self.providers = providers - self.quant_last_matmul = quant_last_matmul - self._post_init() - - def get_model_params_dict(self): - result = dict() - for param in self.model_params_list: - result[param] = getattr(self, param) - return result - - @classmethod - def register_supported_configs(cls) -> List[_OperatorConfig]: - supported_configs = [] - linear_awq_config = AWQConfig( - weight_dtype=["int"], - weight_bits=[1, 2, 3, 4, 5, 6, 7, 8], - weight_group_size=[32, -1, 1, 16, 64, 128, 256, 512, 1024], - weight_sym=[True, False], - act_dtype=["fp32"], - enable_auto_scale=[True, False], - enable_mse_search=[True, False], - ) - operators = ["MatMul"] - supported_configs.append(_OperatorConfig(config=linear_awq_config, operators=operators)) - cls.supported_configs = supported_configs - - def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: - config_mapping = OrderedDict() - if config_list is None: - config_list = [self] - for config in config_list: - # update model level setting - config_mapping.update(config.get_model_params_dict()) - - # update node level setting - global_config = config.global_config - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - for op_name, op_type in model_info: - if self.global_config is not None: - config_mapping[(op_name, op_type)] = global_config - if op_type in op_type_config_dict: - config_mapping[(op_name, op_type)] = op_name_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if re.match(op_name_pattern, op_name): - config_mapping[(op_name, op_type)] = op_name_config_dict[op_name_pattern] - if not self.quant_last_matmul: - config_mapping[model_info[-1]] = { - "weight": {"dtype": "fp32"}, - "activation": {"dtype": "fp32", "quant_mode": "fp32"}, - } - return config_mapping - - @staticmethod - def get_model_info(model: Union[onnx.ModelProto, pathlib.Path, str]) -> list: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model, load_external_data=False) - white_list = ["MatMul"] - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - @classmethod - def get_config_set_for_tuning(cls) -> Union[None, "AWQConfig", List["AWQConfig"]]: # pragma: no cover - return AWQConfig( - weight_bits=[4, 8], - weight_sym=[True, False], - enable_auto_scale=[True, False], - enable_mse_search=[True, False], - ) - - -def get_default_awq_config() -> AWQConfig: - """Generate the default awq config. - - Returns: - the default awq config. 
- """ - return AWQConfig() - - -######################## SmoohQuant Config ############################### - - -@register_config(algo_name=constants.SMOOTH_QUANT, priority=constants.PRIORITY_SMOOTH_QUANT) -class SmoothQuantConfig(BaseConfig, quantization.StaticQuantConfig): - """Smooth quant quantization config.""" - - supported_configs: List[_OperatorConfig] = [] - params_list: List[str] = [ - # smooth parameters - "alpha", - "folding", - "auto_alpha_args", - "calib_iter", - "scales_per_op", - ] - name: str = constants.SMOOTH_QUANT - - def __init__( - self, - alpha: float = 0.5, - folding: bool = True, - op_types: List[str] = ["Gemm", "Conv", "MatMul", "FusedConv"], - calib_iter: int = 100, - scales_per_op: bool = True, - auto_alpha_args: dict = {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"}, - providers: List[str] = ["CPUExecutionProvider"], - white_list: List[Union[str, Callable]] = constants.DEFAULT_WHITE_LIST, - **kwargs, - ): - """Init smooth quant config. - - Args: - alpha (float, optional): alpha value to balance the quantization difficulty of activation and weight. - Defaults to 0.5. - folding (bool, optional): whether fold those foldable Mul which are inserted for smooth quant. - Defaults to True. - op_types (list, optional): the op type to be smooth quantized. - Defaults to ["Gemm", "Conv", "MatMul", "FusedConv"]. - calib_iter (int, optional): iteration num for calibration. Defaults to 100. - scales_per_op (bool, optional): True, each op will have an individual scale, mainlyfor accuracy. - False, ops with the same input will share a scale, mainly for performance. Defaults to True. - auto_alpha_args (dict, optional): settings for alpha tuning. - Defaults to {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"}. - providers (list, optional): providers used for inference. - Defaults to ["CPUExecutionProvider"]. - white_list (list, optional): op in white_list will be applied current config. - Defaults to constants.DEFAULT_WHITE_LIST. 
- kwargs (dict): kwargs in below link are supported except calibration_data_reader: - https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/quantize.py#L78 - """ - BaseConfig.__init__(self) - kwargs.update({"calibration_data_reader": None}) - quantization.StaticQuantConfig.__init__(self, **kwargs) - self.alpha = alpha - self.folding = folding - self.op_types = op_types - self.calib_iter = calib_iter - self.scales_per_op = scales_per_op - self.auto_alpha_args = auto_alpha_args - self.providers = providers - self.white_list = white_list - self.weight_type = self.weight_type.value if isinstance(self.weight_type, enum.Enum) else self.weight_type - self.activation_type = ( - self.activation_type.value if isinstance(self.activation_type, enum.Enum) else self.activation_type - ) - self.calibrate_method = ( - self.calibrate_method.value if isinstance(self.calibrate_method, enum.Enum) else self.calibrate_method - ) - self.quant_format = self.quant_format.value if isinstance(self.quant_format, enum.Enum) else self.quant_format - self._post_init() - - @classmethod - def register_supported_configs(cls) -> List[_OperatorConfig]: - supported_configs = [] - smooth_quant_config = SmoothQuantConfig() - operators = ["Gemm", "Conv", "MatMul", "FusedConv"] - supported_configs.append(_OperatorConfig(config=smooth_quant_config, operators=operators)) - cls.supported_configs = supported_configs - - @staticmethod - def get_model_info(model) -> list: - white_list = ["Gemm", "Conv", "MatMul", "FusedConv"] - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - @classmethod - def get_config_set_for_tuning( - cls, - ) -> Union[None, "SmoothQuantConfig", List["SmoothQuantConfig"]]: # pragma: no cover - return SmoothQuantConfig(alpha=np.arange(0.3, 0.7, 0.05)) - - def convert_to_ort_config(self): - self.activation_type = quantization.QuantType(self.activation_type) - self.weight_type = quantization.QuantType(self.weight_type) - self.weight_type = quantization.QuantType(self.weight_type) - self.calibrate_method = quantization.CalibrationMethod(self.calibrate_method) - self.quant_format = quantization.QuantFormat(self.quant_format) - - -def get_default_sq_config() -> SmoothQuantConfig: - """Generate the default smooth quant config. - - Returns: - the default smooth quant config. - """ - return SmoothQuantConfig() - - -######################## WOQ Tuning Config ############################### - - -def get_woq_tuning_config() -> list: - """Generate the config set for WOQ tuning. - - Returns: - the list of WOQ quant config. 
- """ - RTN_G32ASYM = RTNConfig(weight_sym=False) - GPTQ_G32ASYM = GPTQConfig(weight_sym=False) - GPTQ_G32ASYM_DISABLE_LAST_MATMUL = GPTQConfig(weight_sym=False, quant_last_matmul=False) - GPTQ_G128ASYM = GPTQConfig(weight_group_size=128, weight_sym=False) - AWQ_G32ASYM = AWQConfig(weight_sym=False) - return [RTN_G32ASYM, GPTQ_G32ASYM, GPTQ_G32ASYM_DISABLE_LAST_MATMUL, GPTQ_G128ASYM, AWQ_G32ASYM] - - -##################### INC Algo Configs End ################################### - -register_supported_configs() - -##################### Config for ONNXRuntime-like user-facing API ############ - - -class StaticQuantConfig(quantization.StaticQuantConfig): - - def __init__(self, calibration_data_reader: data_reader.CalibrationDataReader, extra_options=None, *args, **kwargs): - """This is a class for static Quant Configuration. - - Inherit from StaticQuantConfig: - https://github.com/microsoft/onnxruntime/blob/v1.17.1/onnxruntime/python/tools/quantization/quantize.py#L78 - extra_options: - Support smoothquant args. - - SmoothQuant = True/False : - Default is False. If enabled, SmoothQuant algorithm will be applied before quantization to do - fake input channel quantization. - - SmoothQuantAlpha = float : - Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight - and activation quantization. A larger alpha value could be used on models with more significant - activation outliers to migrate more quantization difficulty to weights. - - SmoothQuantFolding = True/False : - Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during - SmoothQuant will be folded into the previous op if the previous op is foldable. - - SmoothQuantOpTypes = list (new args): - Default is ["Gemm", "Conv", "MatMul", "FusedConv"]. It only works if SmoothQuant is True. - It controls the op types to be smooth quantized. - - SmoothQuantCalibIter = int (new args): - Default is 100. It only works if SmoothQuant is True. It controls the iteration num for calibration. - - SmoothQuantScalesPerOp = True/False (new args) : - Default is True. It only works if SmoothQuant is True. - If enabled, each op will have an individual scale, mainlyfor accuracy. - If not enabled, ops with the same input will share a scale, mainly for performance. - """ - super().__init__(calibration_data_reader=calibration_data_reader, extra_options=extra_options, *args, **kwargs) - - def to_dict(self): - return self.__dict__ - - -class DynamicQuantConfig(quantization.DynamicQuantConfig): - """This is a class for dynamic Quant Configuration. 
- - Inherit from DynamicQuantConfig: - https://github.com/microsoft/onnxruntime/blob/v1.17.1/onnxruntime/python/tools/quantization/quantize.py#L206 - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - -def generate_nc_sq_config(quant_config: quantization.StaticQuantConfig): - extra_options = quant_config.extra_options - quant_kwargs = { - "alpha": extra_options.get("SmoothQuantAlpha", 0.5), - "folding": extra_options.get("SmoothQuantFolding", True), - "op_types": extra_options.get("SmoothQuantOpTypes", ["Gemm", "Conv", "MatMul", "FusedConv"]), - "calib_iter": extra_options.get("SmoothQuantCalibIter", 100), - "scales_per_op": extra_options.get("SmoothQuantScalesPerOp", True), - } - quant_config.extra_options["SmoothQuant"] = False - quant_config_dict = quant_config.to_dict() - nc_sq_config = SmoothQuantConfig(**quant_kwargs, **quant_config_dict) - return nc_sq_config diff --git a/onnx_neural_compressor/constants.py b/onnx_neural_compressor/constants.py index d2e0391c6..71caf2a49 100644 --- a/onnx_neural_compressor/constants.py +++ b/onnx_neural_compressor/constants.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -# # Copyright (c) 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -30,6 +28,7 @@ COMPOSABLE_CONFIG = "composable_config" RTN = "rtn" STATIC_QUANT = "static_quant" +DYNAMIC_QUANT = "dynamic_quant" SMOOTH_QUANT = "smooth_quant" GPTQ = "gptq" AWQ = "awq" @@ -44,7 +43,283 @@ PRIORITY_GPTQ = 70 PRIORITY_AWQ = 50 PRIORITY_SMOOTH_QUANT = 80 +PRIORITY_STATIC_QUANT = 70 +PRIORITY_DYNAMIC_QUANT = 60 MAXIMUM_PROTOBUF = 2147483648 WHITE_MODULE_LIST = ["MatMul", "Conv"] + +RTN_OP_LIST = ["MatMul"] + +AWQ_OP_LIST = ["MatMul"] + +GPTQ_OP_LIST = ["MatMul"] + +DYNAMIC_CPU_OP_LIST = ["FusedConv", "Conv", "EmbedLayerNormalization", "MatMul", "Gather", "Attention", "LSTM"] +DYNAMIC_CUDA_OP_LIST = ["FusedConv", "Conv", "EmbedLayerNormalization", "MatMul", "Gather", "Attention", "LSTM"] +DYNAMIC_DML_OP_LIST = [] +DYNAMIC_DNNL_OP_LIST = ["FusedConv", "Conv", "EmbedLayerNormalization", "MatMul", "Gather", "Attention", "LSTM"] +DYNAMIC_TRT_OP_LIST = [] + +STATIC_QDQ_CPU_OP_LIST = [ + "FusedConv", + "Conv", + "Gather", + "GatherElements", + "GatherND", + "Tile", + "MatMul", + "Gemm", + "EmbedLayerNormalization", + "Attention", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "Resize", + "Abs", + "Shrink", + "Sign", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", +] +STATIC_QDQ_CUDA_OP_LIST = [ + "FusedConv", + "Conv", + "Gather", + "MatMul", + "Gemm", + "EmbedLayerNormalization", + "Attention", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "Resize", + "Abs", + "Shrink", + "Sign", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", +] +STATIC_QDQ_DML_OP_LIST = [ + "Conv", + "MatMul", + "Relu", + "Clip", + "MaxPool", +] +STATIC_QDQ_DNNL_OP_LIST = [ + "FusedConv", + "Conv", + "Gather", + "MatMul", + "Gemm", + "EmbedLayerNormalization", + "Attention", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "Resize", +] +STATIC_QDQ_TRT_OP_LIST = [ + 
"Conv", + "MatMul", + "Attention", + "LeakyRelu", + "Gather", + "Sigmoid", + "MaxPool", + "EmbedLayerNormalization", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "Resize", + "Gemm", + "Add", +] + +STATIC_QOPERATOR_CPU_OP_LIST = [ + "FusedConv", + "Conv", + "Gather", + "GatherElements", + "GatherND", + "Tile", + "MatMul", + "Gemm", + "EmbedLayerNormalization", + "Attention", + "Mul", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Add", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "ArgMax", + "Resize", + "Abs", + "Shrink", + "Sign", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", +] +STATIC_QOPERATOR_CUDA_OP_LIST = [ + "FusedConv", + "Conv", + "Gather", + "MatMul", + "Gemm", + "EmbedLayerNormalization", + "Attention", + "Mul", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Add", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "ArgMax", + "Resize", + "Abs", + "Shrink", + "Sign", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", +] +STATIC_QOPERATOR_DML_OP_LIST = [ + "Conv", + "MatMul", + "Mul", + "Relu", + "Clip", + "MaxPool", + "Add", +] +STATIC_QOPERATOR_DNNL_OP_LIST = [ + "FusedConv", + "Conv", + "Gather", + "MatMul", + "Gemm", + "EmbedLayerNormalization", + "Attention", + "Mul", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Add", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "ArgMax", + "Resize", +] +STATIC_QOPERATOR_TRT_OP_LIST = [] + +STATIC_QOPERATOR_OP_LIST_MAP = { + "CPUExecutionProvider": STATIC_QOPERATOR_CPU_OP_LIST, + "CUDAExecutionProvider": STATIC_QOPERATOR_CUDA_OP_LIST, + "DmlExecutionProvider": STATIC_QOPERATOR_DML_OP_LIST, + "DnnlExecutionProvider": STATIC_QOPERATOR_DNNL_OP_LIST, + "TensorrtExecutionProvider": STATIC_QOPERATOR_TRT_OP_LIST, +} + +STATIC_QDQ_OP_LIST_MAP = { + "CPUExecutionProvider": STATIC_QDQ_CPU_OP_LIST, + "CUDAExecutionProvider": STATIC_QDQ_CUDA_OP_LIST, + "DmlExecutionProvider": STATIC_QDQ_DML_OP_LIST, + "DnnlExecutionProvider": STATIC_QDQ_DNNL_OP_LIST, + "TensorrtExecutionProvider": STATIC_QDQ_TRT_OP_LIST, +} + +DYNAMIC_OP_LIST_MAP = { + "CPUExecutionProvider": DYNAMIC_CPU_OP_LIST, + "CUDAExecutionProvider": DYNAMIC_CUDA_OP_LIST, + "DmlExecutionProvider": DYNAMIC_DML_OP_LIST, + "DnnlExecutionProvider": DYNAMIC_DNNL_OP_LIST, + "TensorrtExecutionProvider": DYNAMIC_TRT_OP_LIST, +} diff --git a/onnx_neural_compressor/data_reader.py b/onnx_neural_compressor/data_reader.py index 24538ce55..7f76769f0 100644 --- a/onnx_neural_compressor/data_reader.py +++ b/onnx_neural_compressor/data_reader.py @@ -1,7 +1,4 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation +# Copyright (c) 2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,15 +14,25 @@ import abc -from onnxruntime import quantization +class CalibrationDataReader(metaclass=abc.ABCMeta): + @classmethod + def __subclasshook__(cls, subclass): + return hasattr(subclass, "get_next") and callable(subclass.get_next) or NotImplemented + + @abc.abstractmethod + def get_next(self) -> dict: + """Generate the input data dict for an ONNX InferenceSession run.""" + raise NotImplementedError -class CalibrationDataReader(quantization.CalibrationDataReader): - """Get data for calibration. + def __iter__(self): + return self - We define our CalibrationDataReader based on the class in below link: - https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/calibrate.py#L139 - """ + def __next__(self): + result = self.get_next() + if result is None: + raise StopIteration + return result @abc.abstractmethod def rewind(self): diff --git a/onnx_neural_compressor/onnx_model.py b/onnx_neural_compressor/onnx_model.py index 061f7cad8..c1661f85e 100644 --- a/onnx_neural_compressor/onnx_model.py +++ b/onnx_neural_compressor/onnx_model.py @@ -21,12 +21,11 @@ import onnx import transformers -from onnxruntime.quantization import onnx_model from onnx_neural_compressor import constants, logger, utility -class ONNXModel(onnx_model.ONNXModel): +class ONNXModel: """Build ONNX model.""" def __init__(self, model, **kwargs): @@ -36,27 +35,69 @@ def __init__(self, model, **kwargs): model (str or ModelProto): path to onnx model or loaded ModelProto model object. """ self.model = model if not isinstance(model, str) else onnx.load(model, load_external_data=False) - super().__init__(self.model) - self._model_path = None if not isinstance(model, str) else model self.check_is_large_model() if self._is_large_model and self._model_path is None and not kwargs.get("ignore_warning", False): logger.warning("Model size > 2GB. 
Please use model path instead of onnx model object to quantize") if self._is_large_model and isinstance(model, str) and kwargs.get("load_external_data", True): - onnx.external_data_helper.load_external_data_for_model(self.model, os.path.dirname(self._model_path)) self._config = None if isinstance(model, str) and os.path.exists(pathlib.Path(model).parent.joinpath("config.json").as_posix()): self._config = transformers.PretrainedConfig.from_pretrained(pathlib.Path(model).parent.as_posix()) self.node_name_counter = {} - self._output_name_to_node = self.output_name_to_node() - self._input_name_to_nodes = self.input_name_to_nodes() + self._output_name_to_node = {} + self._input_name_to_nodes = {} + self._get_output_name_to_node(self.model.graph.node) + self._get_input_name_to_nodes(self.model.graph.node) self._graph_info = {} self._get_graph_info() self._q_config = None + def output_name_to_node(self): + self._output_name_to_node = {} + self._get_output_name_to_node(self.model.graph.node) + return self._output_name_to_node + + def input_name_to_nodes(self): + self._input_name_to_nodes = {} + self._get_input_name_to_nodes(self.model.graph.node) + return self._input_name_to_nodes + + def _get_input_name_to_nodes(self, nodes): + """Get input names of nodes.""" + for node in nodes: + attrs = [ + attr + for attr in node.attribute + if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS + ] + if len(attrs) > 0: + for attr in attrs: + self._get_input_name_to_nodes(attr.g.node) + for input_name in node.input: + if len(input_name.strip()) != 0: + if input_name not in self._input_name_to_nodes: + self._input_name_to_nodes[input_name] = [node] + else: + self._input_name_to_nodes[input_name].append(node) + + def _get_output_name_to_node(self, nodes): + """Get output names of nodes.""" + for node in nodes: + attrs = [ + attr + for attr in node.attribute + if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS + ] + if len(attrs) > 0: + for attr in attrs: + self._get_output_name_to_node(attr.g.node) + for output_name in node.output: + if len(output_name.strip()) != 0: + self._output_name_to_node[output_name] = node + @property def model_path(self): """Return model path.""" @@ -99,6 +140,11 @@ def framework(self): """Return framework.""" return "onnxruntime" + def add_initializer(self, tensor): + """Add a initializer to model.""" + if tensor.name not in [i.name for i in self._model.graph.initializer]: + self._model.graph.initializer.append(tensor) + def add_initializers(self, tensors): """Add initializers to model.""" for tensor in tensors: @@ -127,6 +173,42 @@ def output(self): """Return output of model.""" return [i.name for i in self.model.graph.output] + @property + def model(self): + """Return model itself.""" + return self._model + + @model.setter + def model(self, model): + """Set model itself.""" + self._model = model + self._graph_info = {} + self._get_graph_info() + self._output_name_to_node = {} + self._input_name_to_nodes = {} + self._get_input_name_to_nodes(self._model.graph.node) + self._get_output_name_to_node(self._model.graph.node) + + def nodes(self): + """Return model nodes.""" + return self._model.graph.node + + def initializer(self): + """Return model initializer.""" + return self._model.graph.initializer + + def graph(self): + """Return model graph.""" + return self._model.graph + + def ir_version(self): + """Return model ir_version.""" + return self._model.ir_version + + def opset_import(self): + """Return model 
opset_import.""" + return self._model.opset_import + def update(self): """Update model info.""" self._graph_info = {} @@ -144,6 +226,10 @@ def _get_graph_info(self): for node in self.model.graph.node: self.graph_info.update({node.name: node.op_type}) + def is_graph_output(self, name): + """Check whether the tensor is the graph output.""" + return name in self.output() + def save(self, root): """Save ONNX model.""" if os.path.split(root)[0] != "" and not os.path.exists(os.path.split(root)[0]): @@ -168,6 +254,53 @@ def save(self, root): output_config_file = pathlib.Path(root).parent.joinpath("config.json").as_posix() self._config.to_json_file(output_config_file, use_diff=False) + def remove_initializer(self, tensor): + """Remove an initializer from model.""" + if tensor in self._model.graph.initializer: + self._model.graph.initializer.remove(tensor) + + def remove_initializers(self, init_to_remove): + """Remove initializers from model.""" + for initializer in init_to_remove: + self.remove_initializer(initializer) + + def get_initializer(self, name): + """ "Find the initializer with specified name.""" + for initializer in self.model.graph.initializer: + if initializer.name == name: + return initializer + return None + + def remove_node(self, node): + """Remove a node from model.""" + if node in self._model.graph.node: + self._model.graph.node.remove(node) + + def remove_nodes(self, nodes_to_remove): + """Remove nodes from model.""" + for node in nodes_to_remove: + self.remove_node(node) + + def add_node(self, node): + """Add a node to model.""" + self._model.graph.node.extend([node]) + + def add_nodes(self, nodes_to_add): + """Add nodes to model.""" + self._model.graph.node.extend(nodes_to_add) + + def get_children(self, node, input_name_to_nodes=None): + """Get children nodes.""" + if input_name_to_nodes is None: + input_name_to_nodes = self._input_name_to_nodes + + children = [] + for output in node.output: + if output in input_name_to_nodes: + for child in input_name_to_nodes[output]: + children.append(child) + return children + def get_initializer_share_num(self, name): """Get the number of shares of initializer.""" num = 0 @@ -186,6 +319,25 @@ def get_node(self, name): return node return None + def get_parent(self, node, idx, output_name_to_node=None): + if output_name_to_node is None: + output_name_to_node = self._output_name_to_node + if len(node.input) <= idx: + return None + + input = node.input[idx] + return output_name_to_node.get(input, None) + + def get_parents(self, node, output_name_to_node=None): + if output_name_to_node is None: + output_name_to_node = self._output_name_to_node + + parents = [] + for input in node.input: + if input in output_name_to_node: + parents.append(output_name_to_node[input]) + return parents + def get_node_by_weight(self, weight_name): """Get a node by its weight name.""" if len(self._input_name_to_nodes) == 0: @@ -277,6 +429,22 @@ def _searcher(tensor_name): assert zo_tensor, "missing zero point for tensor {}".format(tensor) return scale_tensor, zo_tensor + @staticmethod + def replace_node_input(node, old_input_name, new_input_name): + """Replace input of a node.""" + assert isinstance(old_input_name, str) and isinstance(new_input_name, str) + for j in range(len(node.input)): + if node.input[j] == old_input_name: + node.input[j] = new_input_name + + @staticmethod + def replace_node_output(node, old_output_name, new_output_name): + """Replace output of a node.""" + assert isinstance(old_output_name, str) and isinstance(new_output_name, str) + for j in 
range(len(node.output)): + if node.output[j] == old_output_name: + node.output[j] = new_output_name + def replace_input_of_all_nodes(self, old_input_name, new_input_name, white_optype=[], black_optype=[]): """Replace inputs of all nodes.""" if len(white_optype) > 0: @@ -299,10 +467,21 @@ def replace_output_of_all_nodes(self, old_output_name, new_output_name, white_op if node.op_type not in black_optype: ONNXModel.replace_node_output(node, old_output_name, new_output_name) + def remove_duplicate_nodes(self): + """remove duplicate nodes""" + new_nodes = [] + for node in self.nodes(): + if node not in new_nodes: + new_nodes.append(node) + self.model.graph.ClearField("node") + self.model.graph.node.extend(new_nodes) + self.update() + def remove_unused_nodes(self): """Remove unused nodes.""" unused_nodes = [] nodes = self.nodes() + if len(self._input_name_to_nodes) == 0: self._input_name_to_nodes = self.input_name_to_nodes() if len(self._output_name_to_node) == 0: @@ -314,35 +493,26 @@ def remove_unused_nodes(self): and node.output[0] not in self._input_name_to_nodes ): unused_nodes.append(node) - elif ( - node.op_type == "QuantizeLinear" - and len(self.get_children(node)) == 1 - and self.get_children(node)[0].op_type == "DequantizeLinear" - and node.input[0] not in self._output_name_to_node - and self.get_children(node)[0].output[0] not in self._input_name_to_nodes - ): - unused_nodes.append(node) - unused_nodes.extend(self.get_children(node)) - else: - # remove the node if it does not serve as the input or output of any other nodes - unused = True - for output in node.output: - if output in self._input_name_to_nodes or output in self.output(): - unused = False - break - for input in node.input: - if self.get_initializer(input) is not None: - continue - elif input in self._output_name_to_node or input in self.input(): - unused = False - break - if unused: - unused_nodes.append(node) + self.remove_nodes(unused_nodes) + unvalid_nodes = [ + i + for i in self.model.graph.node + if all(out not in self._input_name_to_nodes and out not in self.output() for out in i.output) + ] + while len(unvalid_nodes) > 0: + self.remove_nodes(unvalid_nodes) + self._input_name_to_nodes = self.input_name_to_nodes() + unvalid_nodes = [ + i + for i in self.model.graph.node + if all([out not in self._input_name_to_nodes and out not in self.output() for out in i.output]) + ] + ununsed_weights = [] for w in self.model.graph.initializer: - if w.name not in self._input_name_to_nodes and w.name not in self.model.graph.output: + if w.name not in self._input_name_to_nodes and w.name not in self.output(): ununsed_weights.append(w) # Remove from graph.input for graph_input in self.graph().input: @@ -351,6 +521,7 @@ def remove_unused_nodes(self): self.remove_initializers(ununsed_weights) self.update() + self.topological_sort() def topological_sort(self, enable_subgraph=False): """Topological sort the model.""" @@ -403,43 +574,6 @@ def topological_sort(self, enable_subgraph=False): self.model.graph.ClearField("node") self.model.graph.node.extend(nodes) - def get_nodes_chain(self, start, stop, result_chain=[]): - """Get nodes chain with given start node and stop node.""" - # process start node list - start_node = collections.deque() - for node in start: - if isinstance(node, str): - start_node.append(node) - elif isinstance(node, onnx.NodeProto): - start_node.append(node.name) - else: - assert False, "'get_nodes_chain' function only support list[string]" "or list[NodeProto] params" - - # process stop node list - stop_node = [] - 
for node in stop: - if isinstance(node, str): - stop_node.append(node) - elif isinstance(node, onnx.NodeProto): - stop_node.append(node.name) - else: - assert False, "'get_nodes_chain' function only support list[string]" "or list[NodeProto] params" - - while start_node: - node_name = start_node.popleft() - if node_name in stop_node: - continue - if node_name not in result_chain: - result_chain.append(node_name) - else: - continue - - node = utility.find_by_name(node_name, list(self.model.graph.node)) - for parent in self.get_parents(node): - start_node.append(parent.name) - - return result_chain - def find_split_node_for_layer_wise_quantization(self): """Find split node for layer wise quantization.""" # find split nodes of decoder blocks @@ -800,22 +934,7 @@ def split_model_with_node(self, split_node_name, path_of_model_to_split, save_bo # origin model : ... -> node_1 -> split_node -> node_2 -> ... # split model 1: ... -> node_1 -> split_node # split model 2: node_2 -> ... - - # remove nodes which are not followed by other nodes - unvalid_nodes = [ - i - for i in self.model.graph.node - if all(out not in self._input_name_to_nodes and not self.is_graph_output(out) for out in i.output) - ] - while len(unvalid_nodes) > 0: - self.remove_nodes(unvalid_nodes) - self._input_name_to_nodes = self.input_name_to_nodes() - unvalid_nodes = [ - i - for i in self.model.graph.node - if all([out not in self._input_name_to_nodes and not self.is_graph_output(out) for out in i.output]) - ] - self.topological_sort() + self.remove_unused_nodes() split_model_part_1 = onnx.ModelProto() split_model_part_1.CopyFrom(self.model) diff --git a/onnx_neural_compressor/quantization/__init__.py b/onnx_neural_compressor/quantization/__init__.py index 7ef91659a..67e82f0fc 100644 --- a/onnx_neural_compressor/quantization/__init__.py +++ b/onnx_neural_compressor/quantization/__init__.py @@ -12,7 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. 
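# A minimal usage sketch of the reworked ONNXModel wrapper above, assuming only the
# methods visible in this diff; the file and initializer names here are hypothetical.
from onnx_neural_compressor.onnx_model import ONNXModel

# Loading by path keeps external data resolvable for models larger than 2GB.
model = ONNXModel("model.onnx")  # hypothetical path

# Graph bookkeeping is now maintained locally instead of being inherited from
# onnxruntime.quantization.onnx_model.ONNXModel.
producers = model.output_name_to_node()   # tensor name -> producing node
consumers = model.input_name_to_nodes()   # tensor name -> consuming nodes
weight = model.get_initializer("fc.weight")  # hypothetical initializer name; None if absent

# remove_unused_nodes() now also drops dangling chains and ends with a topological sort.
model.remove_unused_nodes()
model.save("model_cleaned.onnx")  # hypothetical output path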
- -from onnxruntime.quantization.quant_utils import QuantFormat, QuantType - +from onnx_neural_compressor.quantization.quant_utils import CalibrationMethod, QuantFormat, QuantType from onnx_neural_compressor.quantization.quantize import quantize diff --git a/onnx_neural_compressor/quantization/algorithm_entry.py b/onnx_neural_compressor/quantization/algorithm_entry.py index cd079932c..12689fa7e 100644 --- a/onnx_neural_compressor/quantization/algorithm_entry.py +++ b/onnx_neural_compressor/quantization/algorithm_entry.py @@ -17,11 +17,148 @@ from typing import Union import onnx -from onnxruntime import quantization +import onnxruntime as ort -from onnx_neural_compressor import config, constants, data_reader, logger, utility +from onnx_neural_compressor import constants, data_reader, logger, utility +from onnx_neural_compressor.algorithms.post_training_quant import calibrate, quantizer from onnx_neural_compressor.algorithms.smoother import core from onnx_neural_compressor.algorithms.weight_only import awq, gptq, rtn +from onnx_neural_compressor.quantization import config + + +###################### RTN Algo Entry ################################## +@utility.register_algo(name=constants.RTN) +def rtn_quantize_entry( + model: Union[pathlib.Path, str], quant_config: config.RTNConfig, *args, **kwargs +) -> onnx.ModelProto: + """The main entry to apply rtn quantization.""" + if len(quant_config.config_mapping) == 0: + # map config to each op + model_info = config.RTNConfig.get_model_info(model=model) + config_mapping = quant_config.to_config_mapping(model_info=model_info) + logger.debug(config_mapping) + else: + config_mapping = quant_config.config_mapping + quant_kwargs = {} + quant_kwargs = {key: getattr(quant_config, key) for key in config.RTNConfig.model_params_list} + model = rtn.apply_rtn_on_model(model, config_mapping, **quant_kwargs) + return model + + +###################### GPTQ Algo Entry ################################## +@utility.register_algo(name=constants.GPTQ) +def gptq_quantize_entry( + model: Union[pathlib.Path, str], + quant_config: config.GPTQConfig, + calibration_data_reader: data_reader.CalibrationDataReader, + *args, + **kwargs, +) -> onnx.ModelProto: + """The main entry to apply gptq quantization.""" + assert calibration_data_reader is not None, "Please provide calibration_data_reader" + assert isinstance( + calibration_data_reader, data_reader.CalibrationDataReader + ), "Please follow onnx_neural_compressor/data_reader.py to implement calibration_data_reader" + + if len(quant_config.config_mapping) == 0: + # map config to each op + model_info = config.GPTQConfig.get_model_info(model=model) + config_mapping = quant_config.to_config_mapping(model_info=model_info) + logger.debug(config_mapping) + else: + config_mapping = quant_config.config_mapping + quant_kwargs = {} + quant_kwargs = {key: getattr(quant_config, key) for key in config.GPTQConfig.model_params_list} + + # regenerate to ensure data exists + calibration_data_reader.rewind() + model = gptq.apply_gptq_on_model(model, config_mapping, calibration_data_reader, **quant_kwargs) + return model + + +###################### AWQ Algo Entry ################################## +@utility.register_algo(name=constants.AWQ) +def awq_quantize_entry( + model: Union[pathlib.Path, str], + quant_config: config.AWQConfig, + calibration_data_reader: data_reader.CalibrationDataReader, + *args, + **kwargs, +) -> onnx.ModelProto: + """The main entry to apply awq quantization.""" + assert calibration_data_reader is not None, "Please 
provide calibration_data_reader" + assert isinstance( + calibration_data_reader, data_reader.CalibrationDataReader + ), "Please follow onnx_neural_compressor/data_reader.py to implement calibration_data_reader" + + if len(quant_config.config_mapping) == 0: + # map config to each op + model_info = config.AWQConfig.get_model_info(model=model) + config_mapping = quant_config.to_config_mapping(model_info=model_info) + logger.debug(config_mapping) + else: + config_mapping = quant_config.config_mapping + quant_kwargs = {} + quant_kwargs = {key: getattr(quant_config, key) for key in config.AWQConfig.model_params_list} + + # regenerate to ensure data exists + calibration_data_reader.rewind() + model = awq.apply_awq_on_model(model, config_mapping, calibration_data_reader, **quant_kwargs) + return model + + +###################### Static quant Entry ################################## +@utility.register_algo(name=constants.STATIC_QUANT) +def static_quantize_entry( + model: Union[pathlib.Path, str], + quant_config: config.StaticQuantConfig, + calibration_data_reader: data_reader.CalibrationDataReader, + model_output: Union[pathlib.Path, str] = None, + *args, + **kwargs, +) -> onnx.ModelProto: + """The main entry to apply static quantization.""" + if len(quant_config.op_types_to_quantize) == 0: + logger.warning("No candidate op type to do quantization, exit.") + exit(0) + assert calibration_data_reader is not None, "Please provide calibration_data_reader" + assert isinstance( + calibration_data_reader, data_reader.CalibrationDataReader + ), "Please follow onnx_neural_compressor/data_reader.py to implement calibration_data_reader" + + if len(quant_config.config_mapping) == 0: + # map config to each op + model_info = config.StaticQuantConfig.get_model_info(model=model) + config_mapping = quant_config.to_config_mapping(model_info=model_info) + logger.debug(config_mapping) + else: + config_mapping = quant_config.config_mapping + + calibration_data_reader.rewind() + augment = calibrate.ONNXRTAugment( + model, + calibration_data_reader, + dump_op_types=quant_config.op_types_to_quantize, + execution_provider=quant_config.execution_provider, + iterations=list(range(0, quant_config.calibration_sampling_size)), + ) + min_max = augment.dump_minmax(config_mapping) + quantize_params = augment.dump_calibration(config_mapping, min_max=min_max) + _quantizer = quantizer.StaticQuantizer( + model, + config_mapping, + quant_format=quant_config.quant_format.name.lower(), + quantization_params=quantize_params, + op_types_to_quantize=quant_config.op_types_to_quantize, + execution_provider=quant_config.execution_provider, + optypes_to_exclude_output_quant=quant_config.optypes_to_exclude_output_quant, + dedicated_qdq_pair=quant_config.dedicated_qdq_pair, + add_qdq_pair_to_weight=quant_config.add_qdq_pair_to_weight, + ) + _quantizer.quantize_model() + if model_output is not None: + _quantizer.model.save(model_output) + return _quantizer.model.model ###################### SmoothQuant Entry ################################## @@ -32,7 +169,7 @@ def smooth_quant_entry( calibration_data_reader: data_reader.CalibrationDataReader, model_output: Union[pathlib.Path, str] = None, *args, - **kwargs + **kwargs, ) -> Union[pathlib.Path, str, onnx.ModelProto]: """Apply smooth quant.""" assert calibration_data_reader is not None, "Please provide calibration_data_reader" @@ -45,7 +182,7 @@ def smooth_quant_entry( smoother = core.Smoother( model, calibration_data_reader, - providers=quant_config.providers, + 
execution_provider=getattr(quant_config, "execution_provider", "CPUExecutionProvider"), ) smoothed_model = smoother.transform(**quant_config.to_dict()) with tempfile.TemporaryDirectory(prefix="ort.quant.") as tmp_dir: @@ -65,80 +202,45 @@ def smooth_quant_entry( # exclude Mul operations which are inserted during smooth operation excluded_nodes = [i.name for i in smoothed_model.graph.node if i.name.endswith("_smooth_mul")] - quant_config.calibration_data_reader = calibration_data_reader quant_config.nodes_to_exclude.extend(excluded_nodes) - quant_config.convert_to_ort_config() - quantization.quantize( + + q_model = static_quantize_entry( pathlib.Path(tmp_dir).joinpath("smooth.onnx").as_posix(), - model_output or pathlib.Path(tmp_dir).joinpath("quant_model.onnx").as_posix(), quant_config, + calibration_data_reader, + model_output, ) - model = model_output or onnx.load(pathlib.Path(tmp_dir).joinpath("quant_model.onnx").as_posix()) - - return model - - -###################### RTN Algo Entry ################################## -@utility.register_algo(name=constants.RTN) -def rtn_quantize_entry( - model: Union[pathlib.Path, str], quant_config: config.RTNConfig, *args, **kwargs -) -> onnx.ModelProto: - """The main entry to apply rtn quantization.""" - # map config to each op - model_info = quant_config.get_model_info(model=model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.debug(configs_mapping) - model = rtn.apply_rtn_on_model(model, configs_mapping) - return model - - -###################### GPTQ Algo Entry ################################## -@utility.register_algo(name=constants.GPTQ) -def gptq_quantize_entry( - model: Union[pathlib.Path, str], - quant_config: config.GPTQConfig, - calibration_data_reader: data_reader.CalibrationDataReader, - *args, - **kwargs -) -> onnx.ModelProto: - """The main entry to apply gptq quantization.""" - assert calibration_data_reader is not None, "Please provide calibration_data_reader" - assert isinstance( - calibration_data_reader, data_reader.CalibrationDataReader - ), "Please follow onnx_neural_compressor/data_reader.py to implement calibration_data_reader" + return q_model - # map config to each op - model_info = quant_config.get_model_info(model=model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.debug(configs_mapping) - # regenerate to ensure data exists - calibration_data_reader.rewind() - model = gptq.apply_gptq_on_model(model, configs_mapping, calibration_data_reader) - return model - - -###################### AWQ Algo Entry ################################## -@utility.register_algo(name=constants.AWQ) -def awq_quantize_entry( +###################### Dynamic quant Entry ################################## +@utility.register_algo(name=constants.DYNAMIC_QUANT) +def dynamic_quantize_entry( model: Union[pathlib.Path, str], - quant_config: config.AWQConfig, - calibration_data_reader: data_reader.CalibrationDataReader, + quant_config: config.DynamicQuantConfig, + model_output: Union[pathlib.Path, str] = None, *args, - **kwargs + **kwargs, ) -> onnx.ModelProto: - """The main entry to apply awq quantization.""" - assert calibration_data_reader is not None, "Please provide calibration_data_reader" - assert isinstance( - calibration_data_reader, data_reader.CalibrationDataReader - ), "Please follow onnx_neural_compressor/data_reader.py to implement calibration_data_reader" - - # map config to each op - model_info = quant_config.get_model_info(model=model) - configs_mapping = 
quant_config.to_config_mapping(model_info=model_info) - logger.debug(configs_mapping) - - # regenerate to ensure data exists - calibration_data_reader.rewind() - model = awq.apply_awq_on_model(model, configs_mapping, calibration_data_reader) - return model + """The main entry to apply dynamic quantization.""" + if len(quant_config.op_types_to_quantize) == 0: + logger.warning("No candidate op type to do quantization, exit.") + exit(0) + + if len(quant_config.config_mapping) == 0: + # map config to each op + model_info = config.DynamicQuantConfig.get_model_info(model=model) + config_mapping = quant_config.to_config_mapping(model_info=model_info) + logger.debug(config_mapping) + else: + config_mapping = quant_config.config_mapping + + _quantizer = quantizer.DynamicQuantizer( + model, + config_mapping, + op_types_to_quantize=quant_config.op_types_to_quantize, + ) + _quantizer.quantize_model() + if model_output is not None: + _quantizer.model.save(model_output) + return _quantizer.model.model diff --git a/onnx_neural_compressor/quantization/calibrate.py b/onnx_neural_compressor/quantization/calibrate.py deleted file mode 100644 index 37bf7d671..000000000 --- a/onnx_neural_compressor/quantization/calibrate.py +++ /dev/null @@ -1,32 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import abc - -from onnxruntime import quantization - - -class CalibrationDataReader(quantization.CalibrationDataReader): - """Get data for calibration. - - We define our CalibrationDataReader based on the class in below link: - https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/calibrate.py#L139 - """ - - @abc.abstractmethod - def rewind(self): - """Regenerate data.""" - raise NotImplementedError diff --git a/onnx_neural_compressor/quantization/config.py b/onnx_neural_compressor/quantization/config.py new file mode 100644 index 000000000..5b8dcc178 --- /dev/null +++ b/onnx_neural_compressor/quantization/config.py @@ -0,0 +1,2249 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
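# The entries above assert on data_reader.CalibrationDataReader. A minimal sketch of a
# reader written against the get_next/rewind contract introduced earlier in this patch;
# the input name and shape below are hypothetical.
import numpy as np

from onnx_neural_compressor import data_reader


class RandomDataReader(data_reader.CalibrationDataReader):
    """Yields a fixed number of random samples; rewind() restarts iteration."""

    def __init__(self, num_samples=10):
        self._num_samples = num_samples
        self._index = 0

    def get_next(self):
        # Returning None signals exhaustion, which __next__ turns into StopIteration.
        if self._index >= self._num_samples:
            return None
        self._index += 1
        return {"input": np.random.rand(1, 3, 224, 224).astype(np.float32)}  # hypothetical input name/shape

    def rewind(self):
        self._index = 0


# Entries such as static_quantize_entry and gptq_quantize_entry take an instance of such a
# reader as calibration_data_reader and call rewind() before consuming the data.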
+ +from __future__ import annotations + +import copy +import dataclasses +import enum +import inspect +import itertools +import json +import os +import pathlib +import re +from abc import ABC, abstractmethod + +import numpy as np +import onnx +import pydantic +from onnxruntime import quantization as ort_quant +from typing_extensions import Self + +from onnx_neural_compressor import constants, data_reader, logger, quantization, utility + +from collections import OrderedDict # isort: skip +from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Type, Union, _GenericAlias # isort: skip + + +class ParamLevel(enum.Enum): + OP_LEVEL = enum.auto() + OP_TYPE_LEVEL = enum.auto() + MODEL_LEVEL = enum.auto() + + +class TuningParam: + """Define the tunable parameter for the algorithm. + + Example: + Class FakeAlgoConfig(config.BaseConfig): + '''Fake algo config.'''. + + params_list = [ + ... + # For simple tunable types, like a list of int, giving + # the param name is enough. `config.BaseConfig` class will + # create the `TuningParam` implicitly. + "simple_attr" + + # For complex tunable types, like a list of lists, + # developers need to create the `TuningParam` explicitly. + TuningParam("complex_attr", tunable_type=List[List[str]]) + + # The default parameter level is `ParamLevel.OP_LEVEL`. + # If the parameter is at a different level, developers need + # to specify it explicitly. + TuningParam("model_attr", level=ParamLevel.MODEL_LEVEL) + + ... + + # TODO: more examples to explain the usage of `TuningParam`. + """ + + def __init__( + self, + name: str, + default_val: Any = None, + tunable_type=None, + options=None, + level: ParamLevel = ParamLevel.OP_LEVEL, + ) -> None: + self.name = name + self.default_val = default_val + self.tunable_type = tunable_type + self.options = options + self.level = level + + @staticmethod + def create_input_args_model(expect_args_type: Any) -> type: + """Dynamically create an InputArgsModel based on the provided type hint. + + Parameters: + - expect_args_type (Any): The user-provided type hint for input_args. + + Returns: + - type: The dynamically created InputArgsModel class. + """ + + class DynamicInputArgsModel(pydantic.BaseModel): + input_args: expect_args_type + + return DynamicInputArgsModel + + def is_tunable(self, value: Any) -> bool: + # Use `Pydantic` to validate the input_args. + # TODO: refine the implementation in further. + assert isinstance(self.tunable_type, _GenericAlias), f"Expected a type hint, got {self.tunable_type} instead." + DynamicInputArgsModel = TuningParam.create_input_args_model(self.tunable_type) + try: + new_args = DynamicInputArgsModel(input_args=value) + return True + except Exception as e: + logger.debug(f"Failed to validate the input_args: {e}") + return False + + def __str__(self) -> str: + return "TuningParam(name={}, tunable_type={}, options={}).".format( + self.name, str(self.tunable_type), str(self.options) + ) + + +# Config registry to store all registered configs. +class ConfigRegistry(object): + registered_configs = {} + _config_registry = None + + def __new__(cls) -> Self: + if cls._config_registry is None: + cls._config_registry = super(ConfigRegistry, cls).__new__(cls) + + return cls._config_registry + + @classmethod + def register_config_impl(cls, algo_name: str, priority: Union[float, int] = 0): + """Register config decorator. + + The register the configuration classes for different algorithms. 
+ + Usage example: + @ConfigRegistry.register_config(algo_name=ExampleAlgorithm, priority=100) + class ExampleAlgorithmConfig: + # Configuration details for the ExampleAlgorithm + + Args: + algo_name: the algorithm name. + priority: the priority of the configuration. A larger number indicates a higher priority, + which will be tried first at the auto-tune stage. Defaults to 0. + """ + + def decorator(config_cls): + cls.registered_configs[algo_name] = {"priority": priority, "cls": config_cls} + return config_cls + + return decorator + + @classmethod + def get_all_configs(cls) -> Dict[str, Dict[str, Dict[str, object]]]: + """Get all registered configurations.""" + return cls.registered_configs + + @classmethod + def get_sorted_configs(cls) -> Dict[str, OrderedDict[str, Dict[str, object]]]: + """Get registered configurations sorted by priority.""" + return OrderedDict(sorted(cls.registered_configs.items(), key=lambda x: x[1]["priority"], reverse=True)) + + @classmethod + def get_cls_configs(cls) -> Dict[str, Dict[str, object]]: + """Get registered configurations without priority.""" + cls_configs = {} + for algo_name, config_data in cls.registered_configs.items(): + cls_configs[algo_name] = config_data["cls"] + return cls_configs + + @classmethod + def get_all_config_cls(cls) -> List[Type[BaseConfig]]: + configs_cls = [] + for algo_name, config_pairs in cls.registered_configs.items(): + configs_cls.append(config_pairs["cls"]) + return configs_cls + + +config_registry = ConfigRegistry() + + +def register_config(algo_name: str, priority: Union[float, int] = 0): + """Register config decorator. + + Registers the configuration classes for different algorithms. + + Usage example: + @register_config(algo_name=ExampleAlgorithm, priority=100) + class ExampleAlgorithmConfig: + # Configuration details for the ExampleAlgorithm + + Args: + algo_name: the algorithm name. + priority: the priority of the configuration. A larger number indicates a higher priority, + which will be tried first at the auto-tune stage. Defaults to 0. 
+ """ + + return config_registry.register_config_impl(algo_name=algo_name, priority=priority) + + +class BaseConfig(ABC): + """The base config for all algorithm configs.""" + + name = constants.BASE_CONFIG + params_list: List[Union[str, TuningParam]] = [] + model_params_list: List[Union[str, TuningParam]] = [] + + def __init__( + self, + white_list: Optional[Union[Union[str, Callable], List[Union[str, Callable]]]] = constants.DEFAULT_WHITE_LIST, + ) -> None: + self._global_config: Optional[BaseConfig] = None + # local config is the collections of operator_type configs and operator configs + self._local_config: Dict[str, Optional[BaseConfig]] = {} + self._white_list = white_list + self._config_mapping = OrderedDict() + + def _post_init(self): + if self.white_list == constants.DEFAULT_WHITE_LIST: + global_config = self.get_init_args() + self._global_config = self.__class__(**global_config, white_list=None) + elif isinstance(self.white_list, list) and len(self.white_list) > 0: + for op_name_or_type in self.white_list: + global_config = self.get_init_args() + tmp_config = self.__class__(**global_config, white_list=None) + self.set_local(op_name_or_type, tmp_config) + elif self.white_list == constants.EMPTY_WHITE_LIST: + return + else: + raise NotImplementedError( + f"The white list should be one of {constants.DEFAULT_WHITE_LIST}, {constants.EMPTY_WHITE_LIST}," + " a not empty list, but got {self.white_list}" + ) + + @property + def config_mapping(self): + return self._config_mapping + + @property + def white_list(self): + return self._white_list + + @white_list.setter + def white_list(self, op_name_or_type_list: Optional[List[Union[str, Callable]]]): + self._white_list = op_name_or_type_list + + @property + def global_config(self): + return self._global_config + + @global_config.setter + def global_config(self, config): + self._global_config = config + + @property + def local_config(self): + return self._local_config + + @local_config.setter + def local_config(self, config): + self._local_config = config + + def set_local(self, operator_name: str, config: BaseConfig) -> BaseConfig: + if operator_name in self.local_config and config != self.local_config[operator_name]: + logger.debug("The configuration for %s has already been set, update it.", operator_name) + self.local_config[operator_name] = config + return self + + def to_dict(self): + result = {} + global_config = self.get_init_args() + if bool(self.local_config): + result[constants.LOCAL] = {} + for op_name, config in self.local_config.items(): + result[constants.LOCAL][op_name] = config.to_dict() + if global_config: + result[constants.GLOBAL] = global_config + else: + result = global_config + return result + + def get_params_dict(self): + result = dict() + for param, value in self.__dict__.items(): + if param in self.params_list: + result[param] = value + return result + + def get_init_args(self): + result = dict() + for param, value in self.__dict__.items(): + if param not in ["_global_config", "_local_config", "_white_list", "_config_mapping"]: + result[param] = value + return result + + def __getitem__(self, key): + if hasattr(self, key): + return getattr(self, key) + else: + raise KeyError(f"No such attribute: {key}") + + def __setitem__(self, key, value): + setattr(self, key, value) + + @classmethod + def from_dict(cls, config_dict): + """Construct config from a dict. + + Args: + config_dict: _description_ + + Returns: + The constructed config. 
+ """ + if constants.GLOBAL not in config_dict and constants.LOCAL not in config_dict: + config = cls(**config_dict) + return config + else: + config = cls(**config_dict.get(constants.GLOBAL, {})) + operator_config = config_dict.get(constants.LOCAL, {}) + if operator_config: + for op_name, op_config in operator_config.items(): + config.set_local(op_name, cls(**op_config, white_list=None)) + return config + + def get_diff_dict(self, config) -> Dict[str, Any]: + """Get the difference between current config and user-specific config.""" + diff_cfg = {} + for name, cfg in self.get_init_args().items(): + if hasattr(config, name): + if isinstance(cfg, BaseConfig) and isinstance(config[name], BaseConfig): + diff_cfg[name] = cfg.get_diff_dict(config[name]) + elif cfg != config[name]: + diff_cfg[name] = cfg + else: + diff_cfg[name] = cfg + return diff_cfg + + @classmethod + def from_json_file(cls, filename): + with open(filename, "r", encoding="utf-8") as file: + config_dict = json.load(file) + return cls.from_dict(**config_dict) + + def to_json_file(self, filename): + config_dict = self.to_dict() + with open(filename, "w", encoding="utf-8") as file: + json.dump(config_dict, file, indent=4) + logger.info("Dump the config into %s.", filename) + + def to_json_string(self, use_diff: bool = False) -> Union[str, Dict]: + """Serializes this instance to a JSON string. + + Args: + use_diff (`bool`, *optional*, defaults to `True`): + If set to `True`, only the difference between the config instance and the default `BaseConfig()` + is serialized to JSON string. + + Returns: + `str`: String containing all the attributes that make up this configuration instance in JSON format. + """ + if use_diff is True: + config_dict = self.to_diff_dict(self) + else: + config_dict = self.to_dict() + try: + return json.dumps(config_dict, indent=2) + "\n" + except Exception as e: + logger.error("Failed to serialize the config to JSON string: %s", e) + return config_dict + + def __repr__(self) -> str: + return f"{self.__class__.__name__} {self.to_json_string()}" + + @classmethod + @abstractmethod + def register_supported_configs(cls): + """Add all supported configs.""" + raise NotImplementedError + + @classmethod + def validate(self, user_config: BaseConfig): + # TODO validate the user config + pass + + def __add__(self, other: BaseConfig) -> BaseConfig: + if isinstance(other, type(self)): + for op_name, config in other.local_config.items(): + self.set_local(op_name, config) + return self + else: + return ComposableConfig(configs=[self, other]) + + @staticmethod + def get_the_default_value_of_param(config: BaseConfig, param: str) -> Any: + # Get the signature of the __init__ method + signature = inspect.signature(config.__init__) + + # Get the parameters and their default values + parameters = signature.parameters + return parameters.get(param).default if parameters.get(param) is not None else None + + @staticmethod + def build_tuning_param(config: BaseConfig, param: str): + # Create `tuning.TuningParam` for each param + # There are two cases: + # 1. The param is a string. + # 2. The param is a `tuning.TuningParam` instance. 
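+        # For example (illustrative): RTNConfig declares weight_bits: int = 4, so
+        # build_tuning_param(config, "weight_bits") returns
+        # TuningParam(name="weight_bits", tunable_type=List[int]).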
+ if isinstance(param, str): + signature = inspect.signature(config.__init__) + parameters = signature.parameters + default_param = parameters.get(param).default if parameters.get(param) is not None else None + tuning_param = TuningParam(name=param, tunable_type=List[type(default_param)]) + elif isinstance(param, TuningParam): + tuning_param = param + else: + raise ValueError(f"Unsupported param type: {param}") + return tuning_param + + def expand(self) -> List[BaseConfig]: + """Expand the config. + + Expand rule is: + 1. Expand model_params_list first, then expand params_list + 2. Expand model_params_list/params_list following the order of param order in model_params_list/params_list + + model_params_list=[A, B] params_list=[C,D] + A=[1,2], B=[3,4] C=[5,6], D=[7,8] + + Expanded results: + -------- Combination 1 (C=5, D=7) + / + / -------- Combination 2 (C=6, D=7) + Combination 1 ---- + (A=1, B=3) \ -------- Combination 3 (C=5, D=8) + \ + -------- Combination 4 (C=6, D=8) + + -------- Combination 1 (C=5, D=7) + / + / -------- Combination 2 (C=6, D=7) + Combination 2 ---- + (A=2, B=3) \ -------- Combination 3 (C=5, D=8) + \ + -------- Combination 4 (C=6, D=8) + + -------- Combination 1 (C=5, D=7) + / + / -------- Combination 2 (C=6, D=7) + Combination 3 ---- + (A=1, B=4) \ -------- Combination 3 (C=5, D=8) + \ + -------- Combination 4 (C=6, D=8) + + -------- Combination 1 (C=5, D=7) + / + / -------- Combination 2 (C=6, D=7) + Combination 4 ---- + (A=2, B=4) \ -------- Combination 3 (C=5, D=8) + \ + -------- Combination 4 (C=6, D=8) + """ + config = self + # set model level params + model_level_config_lst: List[BaseConfig] = [] + model_params_list = getattr(self, "model_params_list", []) + tuning_param_list = [] + for param in model_params_list: + tuning_param = self.build_tuning_param(config, param) + param_val = getattr(config, tuning_param.name) + if param_val is not None: + if tuning_param.is_tunable(param_val): + tuning_param.options = param_val + tuning_param_list.append(tuning_param) + + if len(tuning_param_list) == 0: + model_level_config_lst = [config] + else: + tuning_param_name_lst = [tuning_param.name for tuning_param in tuning_param_list] + for params_values in itertools.product(*[tuning_param.options for tuning_param in tuning_param_list[::-1]]): + new_config = copy.deepcopy(self) + for param_name, param_value in zip(tuning_param_name_lst[::-1], params_values): + setattr(new_config, param_name, param_value) + logger.debug(new_config.to_dict()) + model_level_config_lst.append(new_config) + + # set op level params + op_params_list = self.params_list + op_tuning_param_list = [] + local_op_level_config_lst = [] + + for param in op_params_list: + tuning_param = self.build_tuning_param(config, param) + param_val = getattr(config, tuning_param.name) + if param_val is not None: + if tuning_param.is_tunable(param_val) and len(param_val) > 0: + tuning_param.options = param_val + op_tuning_param_list.append(tuning_param) + + if len(op_tuning_param_list) == 0: + local_op_level_config_lst = model_level_config_lst + else: + tuning_param_name_lst = [tuning_param.name for tuning_param in op_tuning_param_list] + tuning_param_val_lst = list( + itertools.product(*[tuning_param.options for tuning_param in op_tuning_param_list[::-1]]) + ) + tuning_param_pair_lst = [dict(zip(tuning_param_name_lst[::-1], val)) for val in tuning_param_val_lst] + + for model_level_config in model_level_config_lst: + for tuning_param_pair in tuning_param_pair_lst: + new_config = copy.deepcopy(model_level_config) + for 
name, val in tuning_param_pair.items(): + setattr(new_config, name, val) + for _, cfg in new_config.local_config.items(): + if isinstance(getattr(cfg, name, None), list) and val in getattr(cfg, name, None): + setattr(cfg, name, val) + logger.debug(new_config.to_dict()) + local_op_level_config_lst.append(new_config) + + logger.info("Expanded the %s and got %d configs.", self.__class__.name, len(local_op_level_config_lst)) + return local_op_level_config_lst + + def _get_op_name_op_type_config(self): + op_type_config_dict = dict() + op_name_config_dict = dict() + for name, config in self.local_config.items(): + if self._is_op_type(name): + op_type_config_dict[name] = config + else: + op_name_config_dict[name] = config + return op_type_config_dict, op_name_config_dict + + def to_config_mapping( + self, config_list: Optional[List[BaseConfig]] = None, model_info: List[Tuple[str, str]] = None + ) -> OrderedDict[Tuple[str, str], OrderedDict[str, BaseConfig]]: + if config_list is None: + config_list = [self] + for config in config_list: + op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() + for op_name, op_type in model_info: + if op_type in op_type_config_dict: + self._config_mapping[op_name] = op_name_config_dict[op_type] + for op_name_pattern in op_name_config_dict: + if isinstance(op_name, str) and re.match(op_name_pattern, op_name): + self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] + elif op_name_pattern == op_name: + self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] + return self._config_mapping + + @staticmethod + def _is_op_type(name: str) -> bool: + return name in constants.STATIC_QOPERATOR_CPU_OP_LIST or name in constants.DYNAMIC_CPU_OP_LIST + + @classmethod + @abstractmethod + def get_config_set_for_tuning(cls): + raise NotImplementedError + + def __eq__(self, other: BaseConfig) -> bool: + if not isinstance(other, type(self)): + return False + return self.get_init_args() == other.get_init_args() + + +class ComposableConfig(BaseConfig): + name = constants.COMPOSABLE_CONFIG + + def __init__(self, configs: List[BaseConfig]) -> None: + self.config_list = configs + self._config_mapping = OrderedDict() + + def __add__(self, other: BaseConfig) -> BaseConfig: + if isinstance(other, type(self)): + self.config_list.extend(other.config_list) + else: + self.config_list.append(other) + return self + + def to_dict(self): + result = {} + for config in self.config_list: + result[config.name] = config.to_dict() + return result + + @classmethod + def from_dict(cls, config_dict: OrderedDict[str, Dict], config_registry: Dict[str, BaseConfig]): + assert len(config_dict) >= 1, "The config dict must include at least one configuration." 
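+        # config_dict is keyed by algorithm name, one entry per config to compose,
+        # e.g. (illustrative): {"RTN": {...}, "GPTQ": {...}}, and config_registry maps
+        # each name to its config class.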
+ num_configs = len(config_dict) + name, value = next(iter(config_dict.items())) + config = config_registry[name].from_dict(value) + for _ in range(num_configs - 1): + name, value = next(iter(config_dict.items())) + config += config_registry[name].from_dict(value) + return config + + def to_json_string(self, use_diff: bool = False) -> str: + return json.dumps(self.to_dict(), indent=2) + "\n" + + def __repr__(self) -> str: + return f"{self.__class__.__name__} {self.to_json_string()}" + + def to_config_mapping( + self, config_list: List[BaseConfig] = None, model_info: Dict[str, Any] = None + ) -> OrderedDict[str, BaseConfig]: + for config in self.config_list: + op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() + single_config_model_info = model_info.get(config.name, None) + for op_name, op_type in single_config_model_info: + if op_type in op_type_config_dict: + self._config_mapping[op_name] = op_name_config_dict[op_type] + for op_name_pattern in op_name_config_dict: + if re.match(op_name_pattern, op_name): + self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] + return self._config_mapping + + @classmethod + def register_supported_configs(cls): + """Add all supported configs.""" + raise NotImplementedError + + @classmethod + def get_config_set_for_tuning(cls) -> None: + # TODO handle the composable config in `tuning_config` + return None + + def get_model_info(self, model, *args, **kwargs): + model_info_dict = dict() + for config in self.config_list: + model_info_dict.update({config.name: config.get_model_info(model, *args, **kwargs)}) + return model_info_dict + + +def get_all_config_set_from_config_registry() -> List[BaseConfig]: + all_registered_config_cls: List[Type[BaseConfig]] = config_registry.get_all_config_cls() + config_set = [] + for config_cls in all_registered_config_cls: + config_set.append(config_cls.get_config_set_for_tuning()) + return config_set + + +def register_supported_configs(): + """Register supported configs.""" + all_registered_config_cls: List[Type[BaseConfig]] = config_registry.get_all_config_cls() + for config_cls in all_registered_config_cls: + config_cls.register_supported_configs() + + +@dataclasses.dataclass +class OperatorConfig: + weight_type: quantization.QuantType + activation_type: quantization.QuantType + per_channel: bool + weight_sym: bool + activation_sym: bool + calibrate_method: quantization.CalibrationMethod = quantization.CalibrationMethod.MinMax + + def __post_init__(self): + self.weight_type = getattr(self.weight_type, "tensor_type", self.weight_type) + self.activation_type = getattr(self.activation_type, "tensor_type", self.activation_type) + self.calibrate_method = getattr(self.calibrate_method, "name", self.calibrate_method) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + setattr(self, key, value) + + def __contains__(self, key): + return hasattr(self, key) + + def update(self, kwargs): + self.weight_type = kwargs.get("weight_type", self.weight_type) + self.activation_type = kwargs.get("activation_type", self.activation_type) + self.per_channel = kwargs.get("per_channel", self.per_channel) + self.weight_sym = kwargs.get("weight_sym", self.weight_sym) + self.calibrate_method = kwargs.get("calibrate_method", self.calibrate_method) + + def to_dict(self): + result = {} + for key, val in self.__dict__.items(): + if not isinstance(val, list): + result[key] = ( + getattr(val, "tensor_type", val) + if isinstance(val, quantization.QuantType) + else 
getattr(val, "value", val) + ) + else: + result[key] = [ + ( + getattr(item, "tensor_type", item) + if isinstance(item, quantization.QuantType) + else getattr(item, "value", item) + ) + for item in val + ] + return result + + def __eq__(self, other): + if isinstance(other, OperatorConfig): + return self.to_dict() == other.to_dict() + else: + return self.to_dict() == other + + +class _OperatorConfig(NamedTuple): + config: OperatorConfig + operators: List[Union[str, Callable]] + valid_func_list: List[Callable] = [] + + +######################## RNT Config ############################### + + +@register_config(algo_name=constants.RTN, priority=constants.PRIORITY_RTN) +class RTNConfig(BaseConfig): + """Config class for round-to-nearest weight-only quantization.""" + + supported_configs: List[_OperatorConfig] = [] + params_list: List[Union[str, TuningParam]] = [ + "weight_dtype", + "weight_bits", + "weight_group_size", + "weight_sym", + "act_dtype", + "accuracy_level", + "ratios", + ] + model_params_list: List[str] = [ + "providers", + "layer_wise_quant", + ] + name: str = constants.RTN + + def __init__( + self, + weight_dtype: str = "int", + weight_bits: int = 4, + weight_group_size: int = 32, + weight_sym: bool = True, + act_dtype: str = "fp32", + accuracy_level: int = 0, + ratios: dict = {}, + providers: List[str] = ["CPUExecutionProvider"], + layer_wise_quant: bool = False, + quant_last_matmul: bool = True, + white_list: List[Union[str, Callable]] = constants.RTN_OP_LIST, + ): + """Init RTN weight-only quantization config. + + Args: + weight_dtype (str, optional): Data type for weights, default is "int". + weight_bits (int, optional): Number of bits used to represent weights, default is 4. + weight_group_size (int, optional): Size of weight groups, default is 32. + weight_sym (bool, optional): Indicates whether weights are symmetric, default is True. + act_dtype (str, optional): Data type for activations, default is "fp32". + accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), + 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), + 4 (int8 compute type of jblas kernel). Defaults to 0. + ratios (dict, optional): percentile of clip. Defaults to {}. + providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. + layer_wise_quant (bool, optional): whether to quantize model layer by layer to save memory footprint. + Check below link for details + https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_layer_wise.md, + default is False. + quant_last_matmul (bool, optional): whether to quantize the last matmul of the model, default is True. + white_list (list, optional): op in white_list will be applied current config. + Defaults to constants.DEFAULT_WHITE_LIST. 
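+
+        Usage example (an illustrative sketch; the op-name pattern is hypothetical):
+
+            from onnx_neural_compressor.quantization import config
+
+            # 8-bit RTN as the global setting, plus a 4-bit override for MatMul nodes
+            # whose name matches the given pattern.
+            cfg = config.RTNConfig(weight_bits=8)
+            cfg.set_local("/attn/.*MatMul", config.RTNConfig(weight_bits=4, white_list=None))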
+ """ + super().__init__(white_list=white_list) + self.weight_bits = weight_bits + self.weight_dtype = weight_dtype + self.weight_group_size = weight_group_size + self.weight_sym = weight_sym + self.act_dtype = act_dtype + self.accuracy_level = accuracy_level + self.ratios = ratios + self.providers = providers + self.layer_wise_quant = layer_wise_quant + self.quant_last_matmul = quant_last_matmul + self._post_init() + + def _post_init(self): + if self.white_list == constants.RTN_OP_LIST: + global_config = self.get_init_args() + self._global_config = self.__class__(**global_config, white_list=None) + elif isinstance(self.white_list, list) and len(self.white_list) > 0: + for op_name_or_type in self.white_list: + global_config = self.get_init_args() + tmp_config = self.__class__(**global_config, white_list=None) + self.set_local(op_name_or_type, tmp_config) + elif self.white_list == constants.EMPTY_WHITE_LIST: + return + + def get_model_params_dict(self): + result = dict() + for param in self.model_params_list: + result[param] = getattr(self, param) + return result + + @classmethod + def register_supported_configs(cls) -> None: + supported_configs = [] + linear_rtn_config = RTNConfig( + weight_dtype=["int"], + weight_bits=[1, 2, 3, 4, 5, 6, 7, 8], + weight_group_size=[32, -1, 1, 16, 64, 128, 256, 512, 1024], + weight_sym=[True, False], + act_dtype=["fp32"], + ) + operators = constants.RTN_OP_LIST + supported_configs.append(_OperatorConfig(config=linear_rtn_config, operators=operators)) + cls.supported_configs = supported_configs + + def to_config_mapping(self, config_list: List[BaseConfig] = None, model_info: list = None): + if config_list is None: + config_list = [self] + for config in config_list: + # update model level setting + self._config_mapping.update(config.get_model_params_dict()) + + # update node level setting + last_matmul = None + global_config = config.get_params_dict() + op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() + for op_name, op_type in model_info: + if op_type == "MatMul": + last_matmul = op_name + if global_config is not None: + self._config_mapping[op_name] = global_config + if op_type in op_type_config_dict: + self._config_mapping[op_name] = op_type_config_dict[op_type] + for op_name_pattern in op_name_config_dict: + if re.match(op_name_pattern, op_name): + self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] + if op_name in self._config_mapping and hasattr(self._config_mapping[op_name], "to_dict"): + self._config_mapping[op_name] = self._config_mapping[op_name].to_dict() + if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: + del self._config_mapping[last_matmul] + return self._config_mapping + + @staticmethod + def get_model_info(model: Union[onnx.ModelProto, pathlib.Path, str], white_list=constants.RTN_OP_LIST) -> list: + if not isinstance(model, onnx.ModelProto): + model = onnx.load(model, load_external_data=False) + + filter_result = [] + for node in model.graph.node: + if node.op_type in white_list: + pair = (node.name, node.op_type) + filter_result.append(pair) + logger.debug(f"Get model info: {filter_result}") + return filter_result + + @classmethod + def get_config_set_for_tuning(cls) -> Union[None, "RTNConfig", List["RTNConfig"]]: # pragma: no cover + return RTNConfig(weight_bits=[4, 8], weight_sym=[True, False]) + + +def get_default_rtn_config() -> RTNConfig: + """Generate the default rtn config. + + Returns: + the default rtn config. 
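+
+    Usage example (an illustrative sketch):
+        from onnx_neural_compressor.quantization import config
+
+        cfg = config.get_default_rtn_config()
+        assert cfg.weight_bits == 4 and cfg.weight_group_size == 32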
+ """ + return RTNConfig() + + +######################## GPTQ Config ############################### + + +@register_config(algo_name=constants.GPTQ, priority=constants.PRIORITY_GPTQ) +class GPTQConfig(BaseConfig): + """Config class for gptq weight-only quantization.""" + + supported_configs: List[_OperatorConfig] = [] + params_list: List[Union[str, TuningParam]] = [ + "weight_dtype", + "weight_bits", + "weight_group_size", + "weight_sym", + "act_dtype", + "accuracy_level", + ] + model_params_list: List[Union[str, TuningParam]] = [ + "percdamp", + "block_size", + "actorder", + "mse", + "perchannel", + "providers", + "layer_wise_quant", + ] + name: str = constants.GPTQ + + def __init__( + self, + weight_dtype: str = "int", + weight_bits: int = 4, + weight_group_size: int = 32, + weight_sym: bool = True, + act_dtype: str = "fp32", + accuracy_level: int = 0, + percdamp: float = 0.01, + block_size: int = 128, + actorder: bool = False, + mse: bool = False, + perchannel: bool = True, + providers: List[str] = ["CPUExecutionProvider"], + layer_wise_quant: bool = False, + quant_last_matmul: bool = True, + white_list: List[Union[str, Callable]] = constants.GPTQ_OP_LIST, + ): + """Init GPTQ weight-only quantization config. + + Args: + weight_dtype (str, optional): data type for weights. Defaults to "int". + weight_bits (int, optional): number of bits used to represent weights. Defaults to 4. + weight_group_size (int, optional): size of weight groups. Defaults to 32. + weight_sym (bool, optional): indicates whether weights are symmetric. Defaults to True. + act_dtype (str, optional): data type for activations. Defaults to "fp32". + accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), + 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), + 4 (int8 compute type of jblas kernel). Defaults to 0. + percdamp (float, optional): percentage of Hessian's diagonal values' average, which will be added + to Hessian's diagonal to increase numerical stability. Defaults to 0.01. + block_size (int, optional): execute GPTQ quantization per block. Defaults to 128. + actorder (bool, optional): whether to sort Hessian's diagonal values to rearrange channel-wise + quantization order. Defaults to False. + mse (bool, optional): whether get scale and zero point with mse error. Defaults to False. + perchannel (bool, optional): whether quantize weight per-channel. Defaults to True. + providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. + layer_wise_quant (bool, optional): whether to quantize model layer by layer to save memory footprint. + Check below link for details + https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_layer_wise.md, + default is False. + quant_last_matmul (bool, optional): whether to quantize the last matmul of the model, default is True. + white_list (list, optional): op in white_list will be applied current config. + Defaults to constants.DEFAULT_WHITE_LIST. 
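+
+        Usage example (an illustrative sketch):
+
+            from onnx_neural_compressor.quantization import config
+
+            # GPTQ with activation reordering and layer-wise quantization to limit
+            # peak memory on large models.
+            cfg = config.GPTQConfig(weight_bits=4, actorder=True, layer_wise_quant=True)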
+ """ + super().__init__(white_list=white_list) + self.weight_bits = weight_bits + self.weight_dtype = weight_dtype + self.weight_group_size = weight_group_size + self.weight_sym = weight_sym + self.act_dtype = act_dtype + self.accuracy_level = accuracy_level + self.percdamp = percdamp + self.block_size = block_size + self.actorder = actorder + self.mse = mse + self.perchannel = perchannel + self.providers = providers + self.layer_wise_quant = layer_wise_quant + self.quant_last_matmul = quant_last_matmul + self._post_init() + + def _post_init(self): + if self.white_list == constants.GPTQ_OP_LIST: + global_config = self.get_init_args() + self._global_config = self.__class__(**global_config, white_list=None) + elif isinstance(self.white_list, list) and len(self.white_list) > 0: + for op_name_or_type in self.white_list: + global_config = self.get_init_args() + tmp_config = self.__class__(**global_config, white_list=None) + self.set_local(op_name_or_type, tmp_config) + elif self.white_list == constants.EMPTY_WHITE_LIST: + return + + def get_model_params_dict(self): + result = dict() + for param in self.model_params_list: + result[param] = getattr(self, param) + return result + + @classmethod + def register_supported_configs(cls) -> None: + supported_configs = [] + linear_gptq_config = GPTQConfig( + weight_dtype=["int"], + weight_bits=[1, 2, 3, 4, 5, 6, 7, 8], + weight_group_size=[32, -1, 1, 16, 64, 128, 256, 512, 1024], + weight_sym=[True, False], + act_dtype=["fp32"], + actorder=[True, False], + mse=[True, False], + perchannel=[True, False], + ) + operators = constants.GPTQ_OP_LIST + supported_configs.append(_OperatorConfig(config=linear_gptq_config, operators=operators)) + cls.supported_configs = supported_configs + + def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: + if config_list is None: + config_list = [self] + for config in config_list: + # update model level setting + self._config_mapping.update(config.get_model_params_dict()) + + # update node level setting + last_matmul = None + global_config = config.get_params_dict() + op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() + for op_name, op_type in model_info: + if op_type == "MatMul": + last_matmul = op_name + if global_config is not None: + self._config_mapping[op_name] = global_config + if op_type in op_type_config_dict: + self._config_mapping[op_name] = op_type_config_dict[op_type] + for op_name_pattern in op_name_config_dict: + if re.match(op_name_pattern, op_name): + self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] + if op_name in self._config_mapping and hasattr(self._config_mapping[op_name], "to_dict"): + self._config_mapping[op_name] = self._config_mapping[op_name].to_dict() + if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: + del self._config_mapping[last_matmul] + return self._config_mapping + + @staticmethod + def get_model_info(model: Union[onnx.ModelProto, pathlib.Path, str], white_list=constants.GPTQ_OP_LIST) -> list: + if not isinstance(model, onnx.ModelProto): + model = onnx.load(model, load_external_data=False) + + filter_result = [] + for node in model.graph.node: + if node.op_type in white_list: + pair = (node.name, node.op_type) + filter_result.append(pair) + logger.debug(f"Get model info: {filter_result}") + return filter_result + + @classmethod + def get_config_set_for_tuning(cls) -> Union[None, "GPTQConfig", List["GPTQConfig"]]: # pragma: no cover + return GPTQConfig( 
+ weight_bits=[4, 8], + weight_sym=[True, False], + actorder=[True, False], + mse=[True, False], + perchannel=[True, False], + ) + + +def get_default_gptq_config() -> GPTQConfig: + """Generate the default gptq config. + + Returns: + the default gptq config. + """ + return GPTQConfig() + + +######################## AWQ Config ############################### + + +@register_config(algo_name=constants.AWQ, priority=constants.PRIORITY_AWQ) +class AWQConfig(BaseConfig): + """Config class for awq weight-only quantization.""" + + supported_configs: List[_OperatorConfig] = [] + params_list: List[str] = [ + "weight_dtype", + "weight_bits", + "weight_group_size", + "weight_sym", + "act_dtype", + "accuracy_level", + ] + model_params_list: List[str] = [ + "enable_auto_scale", + "enable_mse_search", + "providers", + ] + name: str = constants.AWQ + + def __init__( + self, + weight_dtype: str = "int", + weight_bits: int = 4, + weight_group_size: int = 32, + weight_sym: bool = True, + act_dtype: str = "fp32", + accuracy_level: int = 0, + enable_auto_scale: bool = True, + enable_mse_search: bool = True, + providers: List[str] = ["CPUExecutionProvider"], + quant_last_matmul: bool = True, + white_list: List[Union[str, Callable]] = constants.AWQ_OP_LIST, + ): + """Init AWQ weight-only quantization config. + + Args: + weight_dtype (str, optional): data type for weights. Defaults to "int". + weight_bits (int, optional): number of bits used to represent weights. Defaults to 4. + weight_group_size (int, optional): size of weight groups. Defaults to 32. + weight_sym (bool, optional): indicates whether weights are symmetric. Defaults to True. + act_dtype (str, optional): data type for activations. Defaults to "fp32". + accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), + 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), + 4 (int8 compute type of jblas kernel). Defaults to 0. + enable_auto_scale (bool, optional): whether to search for best scales based on activation distribution. + Defaults to True. + enable_mse_search (bool, optional): whether to search for the best clip range from range + [0.91, 1.0, 0.01]. Defaults to True. + providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. + quant_last_matmul (bool, optional): whether to quantize the last matmul of the model, default is True. + white_list (list, optional): op in white_list will be applied current config. + Defaults to constants.DEFAULT_WHITE_LIST. 
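+
+        Usage example (an illustrative sketch):
+
+            from onnx_neural_compressor.quantization import config
+
+            # AWQ with automatic scale search enabled and MSE-based clip search disabled.
+            cfg = config.AWQConfig(weight_bits=4, enable_auto_scale=True, enable_mse_search=False)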
+ """ + super().__init__(white_list=white_list) + self.weight_bits = weight_bits + self.weight_dtype = weight_dtype + self.weight_group_size = weight_group_size + self.weight_sym = weight_sym + self.act_dtype = act_dtype + self.accuracy_level = accuracy_level + self.enable_auto_scale = enable_auto_scale + self.enable_mse_search = enable_mse_search + self.providers = providers + self.quant_last_matmul = quant_last_matmul + self._post_init() + + def _post_init(self): + if self.white_list == constants.GPTQ_OP_LIST: + global_config = self.get_init_args() + self._global_config = self.__class__(**global_config, white_list=None) + elif isinstance(self.white_list, list) and len(self.white_list) > 0: + for op_name_or_type in self.white_list: + global_config = self.get_init_args() + tmp_config = self.__class__(**global_config, white_list=None) + self.set_local(op_name_or_type, tmp_config) + elif self.white_list == constants.EMPTY_WHITE_LIST: + return + + def get_model_params_dict(self): + result = dict() + for param in self.model_params_list: + result[param] = getattr(self, param) + return result + + @classmethod + def register_supported_configs(cls) -> List[_OperatorConfig]: + supported_configs = [] + linear_awq_config = AWQConfig( + weight_dtype=["int"], + weight_bits=[1, 2, 3, 4, 5, 6, 7, 8], + weight_group_size=[32, -1, 1, 16, 64, 128, 256, 512, 1024], + weight_sym=[True, False], + act_dtype=["fp32"], + enable_auto_scale=[True, False], + enable_mse_search=[True, False], + ) + operators = constants.AWQ_OP_LIST + supported_configs.append(_OperatorConfig(config=linear_awq_config, operators=operators)) + cls.supported_configs = supported_configs + + def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: + if config_list is None: + config_list = [self] + for config in config_list: + # update model level setting + self._config_mapping.update(config.get_model_params_dict()) + + # update node level setting + last_matmul = None + global_config = config.get_params_dict() + op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() + for op_name, op_type in model_info: + if op_type == "MatMul": + last_matmul = op_name + if global_config is not None: + self._config_mapping[op_name] = global_config + if op_type in op_type_config_dict: + self._config_mapping[op_name] = op_type_config_dict[op_type] + for op_name_pattern in op_name_config_dict: + if re.match(op_name_pattern, op_name): + self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] + if op_name in self._config_mapping and hasattr(self._config_mapping[op_name], "to_dict"): + self._config_mapping[op_name] = self._config_mapping[op_name].to_dict() + if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: + del self._config_mapping[last_matmul] + return self._config_mapping + + @staticmethod + def get_model_info(model: Union[onnx.ModelProto, pathlib.Path, str], white_list=constants.AWQ_OP_LIST) -> list: + if not isinstance(model, onnx.ModelProto): + model = onnx.load(model, load_external_data=False) + + filter_result = [] + for node in model.graph.node: + if node.op_type in white_list: + pair = (node.name, node.op_type) + filter_result.append(pair) + logger.debug(f"Get model info: {filter_result}") + return filter_result + + @classmethod + def get_config_set_for_tuning(cls) -> Union[None, "AWQConfig", List["AWQConfig"]]: # pragma: no cover + return AWQConfig( + weight_bits=[4, 8], + weight_sym=[True, False], + enable_auto_scale=[True, 
False], + enable_mse_search=[True, False], + ) + + +def get_default_awq_config() -> AWQConfig: + """Generate the default awq config. + + Returns: + the default awq config. + """ + return AWQConfig() + + +######################## WOQ Tuning Config ############################### + + +def get_woq_tuning_config() -> list: + """Generate the config set for WOQ tuning. + + Returns: + the list of WOQ quant config. + """ + RTN_G32ASYM = RTNConfig(weight_sym=False) + GPTQ_G32ASYM = GPTQConfig(weight_sym=False) + GPTQ_G32ASYM_DISABLE_LAST_MATMUL = GPTQConfig(weight_sym=False, quant_last_matmul=False) + GPTQ_G128ASYM = GPTQConfig(weight_group_size=128, weight_sym=False) + AWQ_G32ASYM = AWQConfig(weight_sym=False) + return [RTN_G32ASYM, GPTQ_G32ASYM, GPTQ_G32ASYM_DISABLE_LAST_MATMUL, GPTQ_G128ASYM, AWQ_G32ASYM] + + +##################### Config for ONNXRuntime-like user-facing API ############ + + +class ExtraOptions: + def __init__( + self, + ActivationSymmetric=False, + WeightSymmetric=True, + AddQDQPairToWeight=False, + OpTypesToExcludeOutputQuantization=[], + DedicatedQDQPair=False, + SmoothQuant=False, + SmoothQuantAlpha=0.5, + SmoothQuantFolding=True, + SmoothQuantOpTypes=["Gemm", "Conv", "MatMul", "FusedConv"], + SmoothQuantCalibIter=100, + SmoothQuantScalesPerOp=True, + **kwargs, + ): + self.ActivationSymmetric = ActivationSymmetric + self.WeightSymmetric = WeightSymmetric + self.AddQDQPairToWeight = AddQDQPairToWeight + self.OpTypesToExcludeOutputQuantization = OpTypesToExcludeOutputQuantization + self.DedicatedQDQPair = DedicatedQDQPair + self.SmoothQuant = SmoothQuant + self.SmoothQuantAlpha = SmoothQuantAlpha + self.SmoothQuantFolding = SmoothQuantFolding + self.SmoothQuantOpTypes = SmoothQuantOpTypes + self.SmoothQuantCalibIter = SmoothQuantCalibIter + self.SmoothQuantScalesPerOp = SmoothQuantScalesPerOp + + +def static_basic_check(config, optype, execution_provider, quant_format): + if getattr(quant_format, "value", quant_format) == 0: + if execution_provider not in constants.STATIC_QOPERATOR_OP_LIST_MAP: + raise ValueError( + "Unsupported execution_provider {}, only support {}.".format( + execution_provider, list(constants.STATIC_QOPERATOR_OP_LIST_MAP.keys()) + ) + ) + supported_optype = constants.STATIC_QOPERATOR_OP_LIST_MAP[execution_provider] + if optype not in supported_optype: + raise ValueError( + "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) + ) + elif getattr(quant_format, "value", quant_format) == 1: + if execution_provider not in constants.STATIC_QDQ_OP_LIST_MAP: + raise ValueError( + "Unsupported execution_provider {}, only support {}.".format( + execution_provider, list(constants.STATIC_QDQ_OP_LIST_MAP.keys()) + ) + ) + supported_optype = constants.STATIC_QDQ_OP_LIST_MAP[execution_provider] + if optype not in supported_optype: + raise ValueError( + "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) + ) + else: + raise ValueError( + "Unsupported quant_format {}, only support QuantFormat.QOperator and QuantFormat.QDQ.".format(quant_format) + ) + return config + + +def static_cpu_check(config, optype, execution_provider, quant_format): + if execution_provider != "CPUExecutionProvider": + return config + + # only support per-tensor + if optype in [ + "EmbedLayerNormalization", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Tile", + "Unsqueeze", + "Transpose", + 
"Resize", + "Abs", + "Shrink", + "Sign", + "Attention", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", + "Add", + "Mul", + "ArgMax", + ]: + setattr(config, "per_channel", False) + + if optype in ["Attention"]: + setattr(config, "activation_type", onnx.TensorProto.UINT8) + return config + + +def static_cuda_check(config, optype, execution_provider, quant_format): + if execution_provider != "CUDAExecutionProvider": + return config + + # only support per-tensor + if optype in [ + "EmbedLayerNormalization", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Tile", + "Unsqueeze", + "Transpose", + "Resize", + "Abs", + "Shrink", + "Sign", + "Attention", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", + "Add", + "Mul", + "ArgMax", + ]: + setattr(config, "per_channel", False) + + if optype in ["Attention"]: + setattr(config, "activation_type", onnx.TensorProto.INT8) + setattr(config, "weight_type", onnx.TensorProto.INT8) + return config + + +def static_dml_check(config, optype, execution_provider, quant_format): + if execution_provider != "DmlExecutionProvider": + return config + + # only support per-tensor + if optype in ["Conv", "MatMul", "Mul", "Relu", "Clip", "MaxPool", "Add"]: + setattr(config, "per_channel", False) + return config + + +def static_dnnl_check(config, optype, execution_provider, quant_format): + if execution_provider != "DnnlExecutionProvider": + return config + + # current configurations are same as CPU EP + return static_cpu_check(config, optype, execution_provider, quant_format) + + +def static_trt_check(config, optype, execution_provider, quant_format): + if execution_provider != "TensorrtExecutionProvider": + return config + + # only support S8S8 + if optype in ["Conv", "MatMul", "Gather", "Gemm"]: + setattr(config, "weight_type", onnx.TensorProto.INT8) + setattr(config, "weight_sym", True) + setattr(config, "activation_type", onnx.TensorProto.INT8) + setattr(config, "activation_sym", True) + setattr(config, "per_channel", [False, True]) + else: + setattr(config, "weight_type", onnx.TensorProto.INT8) + setattr(config, "weight_sym", True) + setattr(config, "activation_type", onnx.TensorProto.INT8) + setattr(config, "activation_sym", True) + return config + + +STATIC_CHECK_FUNC_LIST = [ + static_basic_check, + static_cpu_check, + static_cuda_check, + static_dml_check, + static_dnnl_check, + static_trt_check, +] + + +def dynamic_basic_check(config, optype, execution_provider, quant_format=None): + if execution_provider not in constants.DYNAMIC_OP_LIST_MAP: + raise ValueError( + "Unsupported execution_provider {}, only support {}.".format( + execution_provider, list(constants.DYNAMIC_OP_LIST_MAP.keys()) + ) + ) + + supported_optype = constants.DYNAMIC_OP_LIST_MAP[execution_provider] + if optype not in supported_optype: + raise ValueError( + "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) + ) + return config + + +def dynamic_cpu_check(config, optype, execution_provider, quant_format=None): + if execution_provider != "CPUExecutionProvider": + return config + # TODO: add constraints for other EP + if optype in ["FusedConv", "Conv", "EmbedLayerNormalization", "Gather", "Attention", "LSTM"]: + setattr(config, "per_channel", False) + return config + + +def dynamic_cuda_check(config, optype, execution_provider, quant_format=None): 
+ if execution_provider != "CUDAExecutionProvider": + return config + # current configurations are same as CPU EP + return dynamic_cpu_check(config, optype, execution_provider, quant_format) + + +def dynamic_dml_check(config, optype, execution_provider, quant_format=None): + if execution_provider != "DmlExecutionProvider": + return config + + # don't support dynamic quantization + return None + + +def dynamic_dnnl_check(config, optype, execution_provider, quant_format=None): + if execution_provider != "DnnlExecutionProvider": + return config + # current configurations are same as CPU EP + return dynamic_cpu_check(config, optype, execution_provider, quant_format) + + +def dynamic_trt_check(config, optype, execution_provider, quant_format=None): + if execution_provider != "TensorrtExecutionProvider": + return config + + # don't support dynamic quantization + return None + + +DYNAMIC_CHECK_FUNC_LIST = [ + dynamic_basic_check, + dynamic_cpu_check, + dynamic_cuda_check, + dynamic_dml_check, + dynamic_dnnl_check, + dynamic_trt_check, +] + + +@register_config(algo_name=constants.STATIC_QUANT, priority=constants.PRIORITY_STATIC_QUANT) +class StaticQuantConfig(BaseConfig, ort_quant.StaticQuantConfig): + + supported_configs: List[_OperatorConfig] = [] + params_list: List[str] = [ + "weight_type", + "activation_type", + "per_channel", + "weight_sym", + "activation_sym", + "calibrate_method", + ] + model_params_list: List[str] = [ + "quant_format", + "reduce_range", + "use_external_data_format", + "calibration_sampling_size", + "quant_last_matmul", + ] + name: str = constants.STATIC_QUANT + + def __init__( + self, + calibration_data_reader: data_reader.CalibrationDataReader = None, + calibrate_method=quantization.CalibrationMethod.MinMax, + quant_format=quantization.QuantFormat.QOperator, + activation_type=quantization.QuantType.QInt8, + weight_type=quantization.QuantType.QInt8, + op_types_to_quantize=None, + nodes_to_quantize=None, + nodes_to_exclude=None, + per_channel=False, + reduce_range=False, + use_external_data_format=False, + extra_options=None, + calibration_sampling_size=100, + quant_last_matmul=True, + execution_provider=None, + white_list: list = constants.DEFAULT_WHITE_LIST, + **kwargs, + ): + """This is a class for static Quant Configuration. + + Inherit from StaticQuantConfig: + https://github.com/microsoft/onnxruntime/blob/v1.17.1/onnxruntime/python/tools/quantization/quantize.py#L78 + extra_options: + Support smoothquant args. + - SmoothQuant = True/False : + Default is False. If enabled, SmoothQuant algorithm will be applied before quantization to do + fake input channel quantization. + - SmoothQuantAlpha = float : + Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight + and activation quantization. A larger alpha value could be used on models with more significant + activation outliers to migrate more quantization difficulty to weights. + - SmoothQuantFolding = True/False : + Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during + SmoothQuant will be folded into the previous op if the previous op is foldable. + - SmoothQuantOpTypes = list (new args): + Default is ["Gemm", "Conv", "MatMul", "FusedConv"]. It only works if SmoothQuant is True. + It controls the op types to be smooth quantized. + - SmoothQuantCalibIter = int (new args): + Default is 100. It only works if SmoothQuant is True. It controls the iteration num for calibration. + - SmoothQuantScalesPerOp = True/False (new args) : + Default is True. 
It only works if SmoothQuant is True. + If enabled, each op will have an individual scale, mainlyfor accuracy. + If not enabled, ops with the same input will share a scale, mainly for performance. + """ + if execution_provider is None: + execution_provider = utility.auto_detect_ep() + if op_types_to_quantize is None: + op_types_to_quantize = ( + constants.STATIC_QOPERATOR_OP_LIST_MAP.get(execution_provider, []) + if quant_format == quantization.QuantFormat.QOperator + else constants.STATIC_QDQ_OP_LIST_MAP.get(execution_provider, []) + ) + if not reduce_range and not utility.CpuInfo().vnni: + logger.warning( + "VNNI is not supported and reduce_range=False, reduce_range=True is recommended to avoid potential accuracy issue." + ) + ort_quant.StaticQuantConfig.__init__( + self, + calibration_data_reader=calibration_data_reader, + calibrate_method=calibrate_method, + quant_format=quant_format, + activation_type=activation_type, + weight_type=weight_type, + op_types_to_quantize=op_types_to_quantize, + nodes_to_quantize=nodes_to_quantize, + nodes_to_exclude=nodes_to_exclude, + per_channel=per_channel, + reduce_range=reduce_range, + use_external_data_format=use_external_data_format, + extra_options=extra_options, + ) + # do not load TensorRT if backend is not TensorrtExecutionProvider + if "TensorrtExecutionProvider" in execution_provider: + logger.info("Update some parameters for TensorrtExecutionProvider") + os.environ["ORT_TENSORRT_INT8_ENABLE"] = "0" + self.extra_options.update( + { + "AddQDQPairToWeight": True, + "DedicatedQDQPair": True, + "OpTypesToExcludeOutputQuantization": ["Conv", "Gemm", "Add", "MatMul"], + } + ) + else: + os.environ["ORT_TENSORRT_UNAVAILABLE"] = "1" + + BaseConfig.__init__(self, white_list=self.op_types_to_quantize) + self.execution_provider = execution_provider + self.quant_last_matmul = quant_last_matmul + self.calibration_sampling_size = calibration_sampling_size + _extra_options = ExtraOptions(**self.extra_options) + self.weight_sym = _extra_options.WeightSymmetric + self.activation_sym = _extra_options.ActivationSymmetric + self.optypes_to_exclude_output_quant = _extra_options.OpTypesToExcludeOutputQuantization + self.dedicated_qdq_pair = _extra_options.DedicatedQDQPair + self.add_qdq_pair_to_weight = _extra_options.AddQDQPairToWeight + self.white_list = white_list + self._post_init() + + @staticmethod + def get_model_info(model, white_list=constants.STATIC_QOPERATOR_CPU_OP_LIST) -> list: + if not isinstance(model, onnx.ModelProto): + model = onnx.load(model, load_external_data=False) + + filter_result = [] + for node in model.graph.node: + if node.op_type in white_list: + pair = (node.name, node.op_type) + filter_result.append(pair) + logger.debug(f"Get model info: {filter_result}") + return filter_result + + def get_model_params_dict(self): + result = dict() + for param in self.model_params_list: + result[param] = getattr(self, param) + return result + + def _post_init(self): + for op_name_or_type in self.op_types_to_quantize: + params = self.get_params_dict() + op_config = OperatorConfig(**params) + + for valid_func in STATIC_CHECK_FUNC_LIST: + op_config = valid_func(op_config, op_name_or_type, self.execution_provider, self.quant_format) + self.set_local(op_name_or_type, op_config) + if isinstance(self.white_list, list) and len(self.white_list) > 0: + for op_name_or_type in self.white_list: + global_config = self.get_init_args() + tmp_config = self.__class__(**global_config, white_list=None) + self.set_local(op_name_or_type, tmp_config) + + def 
to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: + if config_list is None: + config_list = [self] + for config in config_list: + # update model level setting + self._config_mapping.update(config.get_model_params_dict()) + + # update node level setting + global_config = config.global_config + op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() + last_matmul = None + for op_name, op_type in model_info: + if op_type == "MatMul": + last_matmul = op_name + if ( + isinstance(self.op_types_to_quantize, list) + and len(self.op_types_to_quantize) > 0 + and op_type not in self.op_types_to_quantize + ): + continue + if ( + isinstance(self.nodes_to_quantize, list) + and len(self.nodes_to_quantize) > 0 + and op_name not in self.nodes_to_quantize + ): + continue + if ( + isinstance(self.nodes_to_exclude, list) + and len(self.nodes_to_exclude) > 0 + and op_name in self.nodes_to_exclude + ): + continue + if op_type in op_type_config_dict: + self._config_mapping[op_name] = op_type_config_dict[op_type] + for op_name_pattern in op_name_config_dict: + if re.match(op_name_pattern, op_name): + self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] + + if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: + del self._config_mapping[last_matmul] + return self._config_mapping + + @classmethod + def get_config_set_for_tuning( + cls, + quant_format=quantization.QuantFormat.QOperator, + execution_provider=None, + op_types_to_quantize=None, + nodes_to_exclude=None, + reduce_range=False, + use_external_data_format=False, + calibration_sampling_size=100, + quant_last_matmul=True, + **kwargs, + ) -> Union[None, "StaticQuantConfig", List["StaticQuantConfig"]]: # pragma: no cover + if execution_provider is None: + execution_provider = utility.auto_detect_ep() + StaticQuantConfig.register_supported_configs() + if op_types_to_quantize is None: + op_types_to_quantize = ( + constants.STATIC_QOPERATOR_OP_LIST_MAP.get(execution_provider, []) + if quant_format == quantization.QuantFormat.QOperator + else constants.STATIC_QDQ_OP_LIST_MAP.get(execution_provider, []) + ) + + op_type_candidate = [ + op_types_to_quantize, + list(set(op_types_to_quantize).difference({"Add", "Mul"})), + list(set(op_types_to_quantize).difference({"Add", "Mul", "Gather", "GatherElements", "GatherND"})), + list( + set(op_types_to_quantize).difference( + {"Add", "Mul", "Gather", "GatherElements", "GatherND", "Attention"} + ) + ), + ] + + cfg_lst = [] + for item in op_type_candidate: + cfg_lst.append( + StaticQuantConfig( + execution_provider=execution_provider, + quant_format=quant_format, + reduce_range=reduce_range, + use_external_data_format=use_external_data_format, + calibration_sampling_size=calibration_sampling_size, + op_types_to_quantize=item, + nodes_to_exclude=nodes_to_exclude, + quant_last_matmul=[True, False], + per_channel=[True, False], + **kwargs, + ) + ) + return cfg_lst + + @classmethod + def register_supported_configs(cls) -> None: + supported_configs = [] + supported_configs.append( + _OperatorConfig( + config=OperatorConfig( + weight_type=onnx.TensorProto.UINT8, + weight_sym=False, + per_channel=[True, False], + calibrate_method=[ + quantization.CalibrationMethod.MinMax, + quantization.CalibrationMethod.Entropy, + quantization.CalibrationMethod.Percentile, + ], + activation_type=onnx.TensorProto.UINT8, + activation_sym=False, + ), + operators=["GatherND", "GatherElements", "Gather"], + 
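+                # Gather-family ops use asymmetric UINT8 weights and activations;
+                # per-channel is left tunable ([True, False]).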
valid_func_list=STATIC_CHECK_FUNC_LIST, + ) + ) + supported_configs.append( + _OperatorConfig( + config=OperatorConfig( + weight_type=onnx.TensorProto.UINT8, + weight_sym=False, + per_channel=False, + calibrate_method=[ + quantization.CalibrationMethod.MinMax, + quantization.CalibrationMethod.Entropy, + quantization.CalibrationMethod.Percentile, + ], + activation_type=onnx.TensorProto.UINT8, + activation_sym=False, + ), + operators=["EmbedLayerNormalization"], + valid_func_list=STATIC_CHECK_FUNC_LIST, + ) + ) + supported_configs.append( + _OperatorConfig( + config=OperatorConfig( + weight_type=onnx.TensorProto.INT8, + weight_sym=True, + per_channel=[True, False], + calibrate_method=[ + quantization.CalibrationMethod.MinMax, + quantization.CalibrationMethod.Entropy, + quantization.CalibrationMethod.Percentile, + ], + activation_type=onnx.TensorProto.UINT8, + activation_sym=False, + ), + operators=["Conv", "MatMul", "Gemm", "FusedConv"], + valid_func_list=STATIC_CHECK_FUNC_LIST, + ) + ) + supported_configs.append( + _OperatorConfig( + config=OperatorConfig( + weight_type=onnx.TensorProto.INT8, + weight_sym=True, + per_channel=False, + calibrate_method=[ + quantization.CalibrationMethod.MinMax, + quantization.CalibrationMethod.Entropy, + quantization.CalibrationMethod.Percentile, + ], + activation_type=onnx.TensorProto.UINT8, + activation_sym=False, + ), + operators=[ + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Tile", + "Unsqueeze", + "Transpose", + "Resize", + "Abs", + "Shrink", + "Sign", + "Attention", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", + "Add", + "Mul", + "ArgMax", + ], + valid_func_list=STATIC_CHECK_FUNC_LIST, + ) + ) + cls.supported_configs = supported_configs + + def to_dict(self): + result = {} + for key, val in self.__dict__.items(): + if key in ["_global_config", "_config_mapping"]: + continue + if key == "_local_config": + local_result = {} + for name, cfg in val.items(): + local_result[name] = cfg.to_dict() + result[key] = local_result + continue + if not isinstance(val, list): + result[key] = ( + getattr(val, "tensor_type", val) + if isinstance(val, quantization.QuantType) + else getattr(val, "value", val) + ) + else: + result[key] = [ + ( + getattr(item, "tensor_type", item) + if isinstance(item, quantization.QuantType) + else getattr(item, "value", item) + ) + for item in val + ] + return result + + +######################## SmoohQuant Config ############################### + + +@register_config(algo_name=constants.SMOOTH_QUANT, priority=constants.PRIORITY_SMOOTH_QUANT) +class SmoothQuantConfig(StaticQuantConfig): + """Smooth quant quantization config.""" + + supported_configs: List[_OperatorConfig] = [] + params_list: List[str] = [ + "weight_type", + "activation_type", + "per_channel", + "weight_sym", + "activation_sym", + "calibrate_method", + ] + model_params_list: List[str] = [ + # smooth parameters + "alpha", + "folding", + "auto_alpha_args", + "calib_iter", + "scales_per_op", + ] + name: str = constants.SMOOTH_QUANT + + def __init__( + self, + alpha: float = 0.5, + folding: bool = True, + op_types: List[str] = ["Gemm", "Conv", "MatMul", "FusedConv"], + calib_iter: int = 100, + scales_per_op: bool = True, + auto_alpha_args: dict = {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"}, + white_list: list = None, + **kwargs, + ): + """Init smooth quant config. 
+ + Args: + alpha (float, optional): alpha value to balance the quantization difficulty of activation and weight. + Defaults to 0.5. + folding (bool, optional): whether fold those foldable Mul which are inserted for smooth quant. + Defaults to True. + op_types (list, optional): the op type to be smooth quantized. + Defaults to ["Gemm", "Conv", "MatMul", "FusedConv"]. + calib_iter (int, optional): iteration num for calibration. Defaults to 100. + scales_per_op (bool, optional): True, each op will have an individual scale, mainlyfor accuracy. + False, ops with the same input will share a scale, mainly for performance. Defaults to True. + auto_alpha_args (dict, optional): settings for alpha tuning. + Defaults to {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"}. + kwargs (dict): kwargs in below link are supported except calibration_data_reader: + https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/quantize.py#L78 + """ + super().__init__(white_list=white_list, **kwargs) + self.alpha = alpha + self.folding = folding + self.op_types = op_types + self.calib_iter = calib_iter + self.scales_per_op = scales_per_op + self.auto_alpha_args = auto_alpha_args + + @classmethod + def register_supported_configs(cls) -> List[_OperatorConfig]: + supported_configs = [] + smooth_quant_config = SmoothQuantConfig() + operators = ["Gemm", "Conv", "MatMul", "FusedConv"] + supported_configs.append(_OperatorConfig(config=smooth_quant_config, operators=operators)) + cls.supported_configs = supported_configs + + @staticmethod + def get_model_info(model, white_list=["Gemm", "Conv", "MatMul", "FusedConv"]) -> list: + if not isinstance(model, onnx.ModelProto): + model = onnx.load(model, load_external_data=False) + + filter_result = [] + for node in model.graph.node: + if node.op_type in white_list: + pair = (node.name, node.op_type) + filter_result.append(pair) + logger.debug(f"Get model info: {filter_result}") + return filter_result + + @classmethod + def get_config_set_for_tuning( + cls, + ) -> Union[None, "SmoothQuantConfig", List["SmoothQuantConfig"]]: # pragma: no cover + return SmoothQuantConfig(alpha=np.arange(0.3, 0.7, 0.05)) + + +def get_default_sq_config() -> SmoothQuantConfig: + """Generate the default smooth quant config. + + Returns: + the default smooth quant config. + """ + return SmoothQuantConfig() + + +@register_config(algo_name=constants.DYNAMIC_QUANT, priority=constants.PRIORITY_DYNAMIC_QUANT) +class DynamicQuantConfig(BaseConfig, ort_quant.DynamicQuantConfig): + """This is a class for dynamic Quant Configuration. 
+ + Inherit from DynamicQuantConfig: + https://github.com/microsoft/onnxruntime/blob/v1.17.1/onnxruntime/python/tools/quantization/quantize.py#L206 + """ + + supported_configs: List[_OperatorConfig] = [] + params_list: List[str] = [ + "weight_type", + "activation_type", + "per_channel", + "weight_sym", + "activation_sym", + ] + model_params_list: List[str] = [ + "reduce_range", + "use_external_data_format", + "quant_last_matmul", + ] + name: str = constants.DYNAMIC_QUANT + + def __init__( + self, + weight_type: quantization.QuantType = quantization.QuantType.QInt8, + op_types_to_quantize: List[str] = None, + nodes_to_quantize: List[str] = None, + nodes_to_exclude: List[str] = None, + per_channel: bool = False, + reduce_range: bool = False, + use_external_data_format: bool = False, + extra_options: dict = None, + quant_last_matmul: bool = True, + execution_provider: str = None, + white_list: list = constants.DEFAULT_WHITE_LIST, + **kwargs, + ): + if execution_provider is None: + execution_provider = utility.auto_detect_ep() + if op_types_to_quantize is None: + op_types_to_quantize = constants.DYNAMIC_OP_LIST_MAP.get(execution_provider, []) + if not reduce_range and not utility.CpuInfo().vnni: + logger.warning( + "VNNI is not supported and reduce_range=False, reduce_range=True is recommended to avoid potential accuracy issue." + ) + ort_quant.DynamicQuantConfig.__init__( + self, + weight_type=weight_type, + op_types_to_quantize=op_types_to_quantize, + nodes_to_quantize=nodes_to_quantize, + nodes_to_exclude=nodes_to_exclude, + per_channel=per_channel, + reduce_range=reduce_range, + use_external_data_format=use_external_data_format, + extra_options=extra_options, + ) + BaseConfig.__init__(self, white_list=op_types_to_quantize) + self.execution_provider = execution_provider + self.quant_last_matmul = quant_last_matmul + self.activation_type = quantization.QuantType.QUInt8 + _extra_options = ExtraOptions(**self.extra_options) + self.weight_sym = _extra_options.WeightSymmetric + self.activation_sym = _extra_options.ActivationSymmetric + self.white_list = white_list + self._post_init() + + @staticmethod + def get_model_info(model, white_list=constants.DYNAMIC_CPU_OP_LIST) -> list: + if not isinstance(model, onnx.ModelProto): + model = onnx.load(model, load_external_data=False) + + filter_result = [] + for node in model.graph.node: + if node.op_type in white_list: + pair = (node.name, node.op_type) + filter_result.append(pair) + logger.debug(f"Get model info: {filter_result}") + return filter_result + + def get_model_params_dict(self): + result = dict() + for param in self.model_params_list: + result[param] = getattr(self, param) + return result + + def _post_init(self): + for op_name_or_type in self.op_types_to_quantize: + params = self.get_params_dict() + op_config = OperatorConfig(**params) + for valid_func in DYNAMIC_CHECK_FUNC_LIST: + op_config = valid_func(op_config, op_name_or_type, self.execution_provider) + self.set_local(op_name_or_type, op_config) + if isinstance(self.white_list, list) and len(self.white_list) > 0: + for op_name_or_type in self.white_list: + global_config = self.get_init_args() + tmp_config = self.__class__(**global_config, white_list=None) + self.set_local(op_name_or_type, tmp_config) + + def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: + if config_list is None: + config_list = [self] + for config in config_list: + # update model level setting + self._config_mapping.update(config.get_model_params_dict()) + + # update node 
level setting + op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() + last_matmul = None + for op_name, op_type in model_info: + if op_type == "MatMul": + last_matmul = op_name + if ( + isinstance(self.op_types_to_quantize, list) + and len(self.op_types_to_quantize) > 0 + and op_type not in self.op_types_to_quantize + ): + continue + if ( + isinstance(self.nodes_to_quantize, list) + and len(self.nodes_to_quantize) > 0 + and op_name not in self.nodes_to_quantize + ): + continue + if ( + isinstance(self.nodes_to_exclude, list) + and len(self.nodes_to_exclude) > 0 + and op_name in self.nodes_to_exclude + ): + continue + if op_type in op_type_config_dict: + self._config_mapping[op_name] = op_type_config_dict[op_type] + for op_name_pattern in op_name_config_dict: + if re.match(op_name_pattern, op_name): + self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] + + if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: + del self._config_mapping[last_matmul] + return self._config_mapping + + @classmethod + def get_config_set_for_tuning( + cls, + execution_provider=None, + op_types_to_quantize: List[str] = None, + nodes_to_exclude: List[str] = None, + reduce_range: bool = False, + use_external_data_format: bool = False, + quant_last_matmul: bool = True, + ) -> Union[None, "DynamicQuantConfig", List["DynamicQuantConfig"]]: # pragma: no cover + if execution_provider is None: + execution_provider = utility.auto_detect_ep() + if op_types_to_quantize is None: + op_types_to_quantize = constants.DYNAMIC_OP_LIST_MAP.get(execution_provider, []) + + op_type_candidate = [ + op_types_to_quantize, + list(set(op_types_to_quantize).difference({"EmbedLayerNormalization", "Gather", "LSTM"})), + list( + set(op_types_to_quantize).difference({"EmbedLayerNormalization", "Gather", "LSTM", "Conv", "FusedConv"}) + ), + list( + set(op_types_to_quantize).difference( + {"EmbedLayerNormalization", "Gather", "LSTM", "Conv", "FusedConv", "Attention"} + ) + ), + list( + set(op_types_to_quantize).difference( + {"EmbedLayerNormalization", "Gather", "LSTM", "Conv", "FusedConv", "MatMul"} + ) + ), + ] + + cfg_lst = [] + for item in op_type_candidate: + cfg_lst.append( + DynamicQuantConfig( + execution_provider=execution_provider, + op_types_to_quantize=item, + nodes_to_exclude=nodes_to_exclude, + reduce_range=reduce_range, + use_external_data_format=use_external_data_format, + quant_last_matmul=[True, False], + per_channel=[True, False], + ) + ) + return cfg_lst + + @classmethod + def register_supported_configs(cls) -> None: + supported_configs = [] + supported_configs.append( + _OperatorConfig( + config=OperatorConfig( + weight_type=onnx.TensorProto.UINT8, + weight_sym=False, + per_channel=False, + activation_type=onnx.TensorProto.UINT8, + activation_sym=False, + ), + operators=["FusedConv", "Conv", "EmbedLayerNormalization"], + valid_func_list=DYNAMIC_CHECK_FUNC_LIST, + ) + ) + supported_configs.append( + _OperatorConfig( + config=OperatorConfig( + weight_type=onnx.TensorProto.INT8, + weight_sym=True, + per_channel=[True, False], + activation_type=onnx.TensorProto.UINT8, + activation_sym=False, + ), + operators=["MatMul"], + valid_func_list=DYNAMIC_CHECK_FUNC_LIST, + ) + ) + supported_configs.append( + _OperatorConfig( + config=OperatorConfig( + weight_type=onnx.TensorProto.INT8, + weight_sym=True, + per_channel=False, + activation_type=onnx.TensorProto.UINT8, + activation_sym=False, + ), + operators=["Gather", "Attention", "LSTM"], + 
valid_func_list=DYNAMIC_CHECK_FUNC_LIST, + ) + ) + cls.supported_configs = supported_configs + + def to_dict(self): + result = {} + for key, val in self.__dict__.items(): + if key in ["_global_config", "_config_mapping"]: + continue + if key == "_local_config": + local_result = {} + for name, cfg in val.items(): + local_result[name] = cfg.to_dict() + result[key] = local_result + continue + if not isinstance(val, list): + result[key] = ( + getattr(val, "tensor_type", val) + if isinstance(val, quantization.QuantType) + else getattr(val, "value", val) + ) + else: + result[key] = [ + ( + getattr(item, "tensor_type", item) + if isinstance(item, quantization.QuantType) + else getattr(item, "value", item) + ) + for item in val + ] + return result + + +##################### NC Algo Configs End ################################### + +register_supported_configs() diff --git a/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py b/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py index 62a671fba..41c58a29f 100644 --- a/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py +++ b/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py @@ -15,7 +15,7 @@ from typing import List, Union # isort: skip import onnx -from onnxruntime.quantization import matmul_4bits_quantizer +import onnxruntime as ort from onnx_neural_compressor.quantization import matmul_nbits_quantizer @@ -33,8 +33,9 @@ def __init__( is_symmetric: bool = False, accuracy_level: int = 0, nodes_to_exclude=None, - algo_config: matmul_4bits_quantizer.WeightOnlyQuantConfig = None, + algo_config: matmul_nbits_quantizer.WeightOnlyQuantConfig = None, providers: List[str] = ["CPUExecutionProvider"], + optimization_level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, ): super().__init__( model=model, @@ -45,4 +46,5 @@ def __init__( algo_config=algo_config, n_bits=4, providers=providers, + optimization_level=optimization_level, ) diff --git a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py index 0d00bbbc5..ea77b18de 100644 --- a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py +++ b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py @@ -14,21 +14,41 @@ from typing import List, Union # isort: skip +import pathlib +import tempfile + import onnx -from onnxruntime.quantization import matmul_4bits_quantizer +import onnxruntime as ort -from onnx_neural_compressor import config, data_reader, logger, onnx_model, utility +from onnx_neural_compressor import data_reader, logger, onnx_model, utility from onnx_neural_compressor.quantization import algorithm_entry as algos +from onnx_neural_compressor.quantization import config + + +class WeightOnlyQuantConfig: + def __init__(self, algorithm): + """This is the Base class for Weight Only Quant Configuration. + Args: + algorithm: + weight only quantize algorithm name. 
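As a hedged usage sketch of the weight-only flow this base class anchors: the MatMulNBitsQuantizer class name is assumed from the module name, and "model.onnx" is a placeholder path.

    import onnxruntime as ort

    from onnx_neural_compressor.quantization import matmul_nbits_quantizer

    algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=False)
    quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
        model="model.onnx",  # placeholder: a str path or an onnx.ModelProto
        n_bits=4,
        block_size=32,
        algo_config=algo_config,
        optimization_level=ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
    )
    quant.process()
    qmodel = quant.model  # the quantized model
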
+ """ + self.algorithm = algorithm -class RTNWeightOnlyQuantConfig(matmul_4bits_quantizer.RTNWeightOnlyQuantConfig): + +class RTNWeightOnlyQuantConfig(WeightOnlyQuantConfig): def __init__(self, ratios=None, layer_wise_quant=False): - super().__init__(ratios=ratios) + super().__init__( + algorithm="RTN", + ) + if ratios is None: + ratios = {} + self.ratios = ratios self.layer_wise_quant = layer_wise_quant -class GPTQWeightOnlyQuantConfig(matmul_4bits_quantizer.GPTQWeightOnlyQuantConfig): +class GPTQWeightOnlyQuantConfig(WeightOnlyQuantConfig): def __init__( self, @@ -41,17 +61,18 @@ def __init__( layer_wise_quant=False, ): super().__init__( - calibration_data_reader=calibration_data_reader, - percdamp=percdamp, - block_size=block_size, - actorder=actorder, - mse=mse, - perchannel=perchannel, + algorithm="GPTQ", ) + self.calibration_data_reader = calibration_data_reader + self.percdamp = percdamp + self.block_size = block_size + self.actorder = actorder + self.mse = mse + self.perchannel = perchannel self.layer_wise_quant = layer_wise_quant -class AWQWeightOnlyQuantConfig(matmul_4bits_quantizer.WeightOnlyQuantConfig): +class AWQWeightOnlyQuantConfig(WeightOnlyQuantConfig): def __init__( self, @@ -81,15 +102,14 @@ def __init__( is_symmetric: bool = False, accuracy_level: int = 0, nodes_to_exclude: List[str] = None, - algo_config: matmul_4bits_quantizer.WeightOnlyQuantConfig = None, + algo_config: WeightOnlyQuantConfig = None, n_bits: int = 4, providers: List[str] = ["CPUExecutionProvider"], + optimization_level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, ): if nodes_to_exclude is None: nodes_to_exclude = [] - self.model_path = model if isinstance(model, str) else None self.model = model - self.model = onnx_model.ONNXModel(onnx.load(model)) if isinstance(model, str) else onnx_model.ONNXModel(model) self.block_size = block_size self.is_symmetric = is_symmetric self.accuracy_level = accuracy_level @@ -98,6 +118,7 @@ def __init__( self.n_bits = n_bits self.providers = providers self.algorithm = self.algo_config.algorithm + self.optimization_level = optimization_level assert self.algorithm in [ "RTN", "AWQ", @@ -106,7 +127,6 @@ def __init__( def _generate_nc_config(self): config_class = config.config_registry.get_cls_configs()[self.algorithm.lower()] - quant_kwargs = { "weight_bits": self.n_bits, "weight_group_size": self.block_size, @@ -124,7 +144,7 @@ def _generate_nc_config(self): quant_kwargs.update( { "percdamp": self.algo_config.percdamp, - "blocksize": self.algo_config.block_size, + "block_size": self.algo_config.block_size, "actorder": self.algo_config.actorder, "mse": self.algo_config.mse, "perchannel": self.algo_config.perchannel, @@ -148,9 +168,33 @@ def _generate_nc_config(self): def int4_quant_algo(self): qconfig = self._generate_nc_config() + model = self.model + opt_tmp_file = tempfile.TemporaryDirectory() + + # do graph optimization if not layer_wise_quant + if ( + not getattr(self.algo_config, "layer_wise_quant", False) + and self.optimization_level != ort.GraphOptimizationLevel.ORT_DISABLE_ALL + ): + if not isinstance(model, str): + onnx.save(model, pathlib.Path(opt_tmp_file.name).joinpath("tmp.onnx").as_posix()) + model = pathlib.Path(opt_tmp_file.name).joinpath("tmp.onnx").as_posix() + logger.info("Start graph optimization...") + sess_options = ort.SessionOptions() + sess_options.graph_optimization_level = self.optimization_level + sess_options.optimized_model_filepath = pathlib.Path(opt_tmp_file.name).joinpath("opt.onnx").as_posix() + 
sess_options.add_session_config_entry( + "session.optimized_model_external_initializers_file_name", "opt.onnx_data" + ) + sess_options.add_session_config_entry( + "session.optimized_model_external_initializers_min_size_in_bytes", "1024" + ) + session = ort.InferenceSession(model, sess_options) + model = sess_options.optimized_model_filepath + del session + logger.info("Graph optimization done.") logger.info(f"start to quantize model with {self.algorithm} algorithm...") - model = self.model_path or self.model if self.algorithm == "RTN": self.model = algos.rtn_quantize_entry(model, qconfig) elif self.algorithm == "GPTQ": @@ -158,6 +202,7 @@ def int4_quant_algo(self): elif self.algorithm == "AWQ": self.model = algos.awq_quantize_entry(model, qconfig, self.algo_config.calibration_data_reader) logger.info(f"complete quantization of model with {self.algorithm} algorithm.") + opt_tmp_file.cleanup() def process(self): self.int4_quant_algo() diff --git a/onnx_neural_compressor/quantization/quant_utils.py b/onnx_neural_compressor/quantization/quant_utils.py new file mode 100644 index 000000000..2d5518857 --- /dev/null +++ b/onnx_neural_compressor/quantization/quant_utils.py @@ -0,0 +1,47 @@ +# Copyright (c) 2023 MIT HAN Lab +# This source code is licensed under the MIT license +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import enum + +import onnx + + +class QuantType(enum.Enum): # pragma: no cover + """Represent QuantType value.""" + + QInt8 = 0 + QUInt8 = 1 + + @property + def tensor_type(self): + if self == QuantType.QInt8: + return onnx.TensorProto.INT8 + if self == QuantType.QUInt8: + return onnx.TensorProto.UINT8 + raise ValueError(f"Unexpected value qtype={self!r}.") + + +class QuantFormat(enum.Enum): + QOperator = 0 + QDQ = 1 + + +class CalibrationMethod(enum.Enum): + MinMax = 0 + Entropy = 1 + Percentile = 2 + Distribution = 3 diff --git a/onnx_neural_compressor/quantization/quantize.py b/onnx_neural_compressor/quantization/quantize.py index 7e388e3aa..9fb3dfd41 100644 --- a/onnx_neural_compressor/quantization/quantize.py +++ b/onnx_neural_compressor/quantization/quantize.py @@ -13,32 +13,51 @@ # limitations under the License. 
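For reference, the n-bits quantizer above and the quantize()/autotune entry points below rely on the same ONNX Runtime offline graph-optimization pattern. A standalone sketch of that pattern, with a placeholder model path and a hypothetical helper name:

    import pathlib
    import tempfile

    import onnxruntime as ort


    def pre_optimize(model_path: str, level=ort.GraphOptimizationLevel.ORT_ENABLE_BASIC) -> str:
        """Run ORT offline graph optimization and return the path of the optimized model."""
        tmp_dir = tempfile.mkdtemp(prefix="ort.opt.")
        sess_options = ort.SessionOptions()
        sess_options.graph_optimization_level = level
        sess_options.optimized_model_filepath = pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix()
        # keep large initializers in a side file so big models can still be serialized
        sess_options.add_session_config_entry("session.optimized_model_external_initializers_file_name", "opt.onnx_data")
        sess_options.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "1024")
        ort.InferenceSession(model_path, sess_options)  # creating the session writes the optimized model
        return sess_options.optimized_model_filepath
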
import pathlib +import tempfile from typing import Union import onnx +import onnxruntime as ort from onnxruntime.quantization.quantize import QuantConfig -from onnx_neural_compressor import config from onnx_neural_compressor.quantization import algorithm_entry as algos +from onnx_neural_compressor.quantization import config # ORT-like user-facing API def quantize( model_input: Union[str, pathlib.Path, onnx.ModelProto], model_output: Union[str, pathlib.Path], - quant_config: QuantConfig, + quant_config: config.BaseConfig, + optimization_level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, ): - if isinstance(quant_config, config.StaticQuantConfig): - if quant_config.extra_options.get("SmoothQuant", False): - nc_sq_config = config.generate_nc_sq_config(quant_config) - algos.smooth_quant_entry( - model_input, nc_sq_config, quant_config.calibration_data_reader, model_output=model_output + with tempfile.TemporaryDirectory(prefix="ort.opt.") as tmp_dir: + if optimization_level != ort.GraphOptimizationLevel.ORT_DISABLE_ALL: + sess_options = ort.SessionOptions() + sess_options.graph_optimization_level = optimization_level + sess_options.optimized_model_filepath = pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix() + sess_options.add_session_config_entry( + "session.optimized_model_external_initializers_file_name", "opt.onnx_data" ) + sess_options.add_session_config_entry( + "session.optimized_model_external_initializers_min_size_in_bytes", "1024" + ) + session = ort.InferenceSession(model_input, sess_options) + del session + model_input = sess_options.optimized_model_filepath + + if isinstance(quant_config, config.StaticQuantConfig): + if quant_config.extra_options.get("SmoothQuant", False): + algos.smooth_quant_entry( + model_input, quant_config, quant_config.calibration_data_reader, model_output=model_output + ) + else: + algos.static_quantize_entry( + model_input, quant_config, quant_config.calibration_data_reader, model_output=model_output + ) + elif isinstance(quant_config, config.DynamicQuantConfig): + algos.dynamic_quantize_entry(model_input, quant_config, model_output=model_output) else: - # call static_quant_entry - pass - elif isinstance(quant_config, config.DynamicQuantConfig): - # call dynamic_quant_entry - pass - else: - raise TypeError("Invalid quantization config type, it must be either StaticQuantConfig or DynamicQuantConfig.") + raise TypeError( + "Invalid quantization config type, it must be either StaticQuantConfig or DynamicQuantConfig." 
+ ) diff --git a/onnx_neural_compressor/quantization/tuning.py b/onnx_neural_compressor/quantization/tuning.py index a6743ad7a..5bf2d95d4 100644 --- a/onnx_neural_compressor/quantization/tuning.py +++ b/onnx_neural_compressor/quantization/tuning.py @@ -15,12 +15,17 @@ import copy import os import pathlib +import shutil import tempfile +import traceback import uuid import onnx +import onnxruntime as ort +from onnx import external_data_helper -from onnx_neural_compressor import config, data_reader, logger, utility +from onnx_neural_compressor import data_reader, logger, utility +from onnx_neural_compressor.quantization import config from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Sized, Tuple, Union # isort: skip @@ -99,7 +104,9 @@ def _set_eval_fn_registry(self, user_eval_fns: List[Dict]) -> None: { self.EVAL_FN: user_eval_fn_pair[self.EVAL_FN], self.WEIGHT: user_eval_fn_pair.get(self.WEIGHT, 1.0), - self.FN_NAME: user_eval_fn_pair.get(self.FN_NAME, user_eval_fn_pair[self.EVAL_FN].__name__), + self.FN_NAME: user_eval_fn_pair.get( + self.FN_NAME, getattr(user_eval_fn_pair[self.EVAL_FN], "__name__", "custom_func") + ), } for user_eval_fn_pair in user_eval_fns ] @@ -224,13 +231,29 @@ def __len__(self) -> int: class ConfigLoader: - def __init__(self, config_set: ConfigSet, sampler: Sampler = default_sampler) -> None: + def __init__( + self, config_set: ConfigSet, sampler: Sampler = default_sampler, skip_verified_config: bool = True + ) -> None: self.config_set = ConfigSet.from_fwk_configs(config_set) self._sampler = sampler(self.config_set) + self.skip_verified_config = skip_verified_config + self.verify_config_list = list() + + def is_verified_config(self, config): + for verified_config in self.verify_config_list: + if config == verified_config: + return True + return False def __iter__(self) -> Generator[config.BaseConfig, Any, None]: for index in self._sampler: - yield self.config_set[index] + new_config = self.config_set[index] + if self.skip_verified_config and self.is_verified_config(new_config): + logger.debug("Skip the verified config:") + logger.debug(new_config.to_dict()) + continue + self.verify_config_list.append(new_config) + yield new_config class TuningConfig: @@ -317,13 +340,13 @@ def set_baseline(self, baseline: float): def get_number_of_trials(self): return len(self.tuning_history) - def get_best_quant_config(self) -> config.BaseConfig: - assert self.get_number_of_trials() > 0, "No trial record in tuning monitor." - # Put the record with a higher score at the beginning - sorted_trials_records: List[_TrialRecord] = sorted( - self.tuning_history, key=lambda x: x.trial_result, reverse=True - ) - return sorted_trials_records[0].quant_config + def need_skip(self, config) -> bool: + """Check whether the expanded quant config is verified.""" + if len(self.tuning_history) > 0 and any([config == i.quant_config.config_mapping for i in self.tuning_history]): + logger.warning("Skip the verified config mapping.") + logger.debug(config) + return True + return False def need_stop(self) -> bool: """Check if need to stop tuning. Either accuracy goal is met, max trials is reached or timeout is reached. @@ -343,6 +366,12 @@ def need_stop(self) -> bool: # [-1] is the last element representing the latest trail record. 
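Putting the tuning pieces together, a hedged sketch of driving the autotune entry defined below; the TuningConfig keyword names are assumed here, and eval_fn receives a model path and must return a scalar metric:

    import onnxruntime as ort

    from onnx_neural_compressor.quantization import config, tuning


    def eval_fn(model_path: str) -> float:
        # placeholder metric; autotune always passes the candidate model as a file path
        return 1.0


    best_model = tuning.autotune(
        model_input="model.onnx",  # placeholder path
        tune_config=tuning.TuningConfig(
            config_set=config.DynamicQuantConfig.get_config_set_for_tuning(),  # assumed keyword
            tolerable_loss=0.01,  # assumed keyword, mirrored from the tolerable_loss hint in this file
        ),
        eval_fn=eval_fn,
        optimization_level=ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
    )
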
return reach_max_trials or meet_accuracy_goal + def print_config_diff(self, config): + if len(self.tuning_history) == 0: + logger.info("quant config: {}".format(config)) + else: + logger.info("quant config difference: {}".format(config.get_diff_dict(self.tuning_history[0].quant_config))) + class TuningLogger: """A unified logger for the tuning/quantization process. @@ -398,8 +427,6 @@ def _need_apply(quant_config: config.BaseConfig, algo_name): return quant_config.name == algo_name if hasattr(quant_config, "name") else False -# * only for internal usage now -@utility.log_quant_execution def _quantize( model_input: Union[pathlib.Path, str], quant_config: config.BaseConfig, @@ -424,7 +451,7 @@ def _quantize( assert isinstance( quant_config, config.BaseConfig ), f"Please pass a dict or config instance as the quantization configuration, but got {type(quant_config)}." - logger.info(f"Quantize model with config: \n {quant_config} \n") + logger.debug(f"Quantize model with config: \n {quant_config} \n") # select quantization algo according to config q_model = None @@ -441,6 +468,7 @@ def autotune( eval_fn: Callable, eval_args: Optional[Tuple[Any]] = None, calibration_data_reader: data_reader.CalibrationDataReader = None, + optimization_level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, ) -> Union[None, onnx.ModelProto]: """The main entry of auto-tune. @@ -455,63 +483,104 @@ def autotune( During evaluation, autotune will only pass model path as the input of function. eval_args (Optional[Tuple[Any]]): evaluate arguments. Positional arguments for `eval_fn`. - calibration_data_reader (data_reader.CalibrationDataReader): dataloader for calibration. + optimization_level (onnxruntime.GraphOptimizationLevel): graph optimization level. + Support ORT_DISABLE_ALL, ORT_ENABLE_ALL, ORT_ENABLE_BASIC, ORT_ENABLE_EXTENDED. Default is ORT_ENABLE_BASIC. 
+ Details: https://onnxruntime.ai/docs/performance/model-optimizations/graph-optimizations.html#onlineoffline-mode """ best_quant_model = None eval_func_wrapper = EvaluationFuncWrapper(eval_fn, eval_args) config_loader, tuning_logger, tuning_monitor = init_tuning(tuning_config=tune_config) + tmp_folder = tempfile.TemporaryDirectory() + pathlib.Path(tmp_folder.name).joinpath("./eval").mkdir() + if optimization_level != ort.GraphOptimizationLevel.ORT_DISABLE_ALL: + sess_options = ort.SessionOptions() + sess_options.graph_optimization_level = optimization_level + sess_options.optimized_model_filepath = pathlib.Path(tmp_folder.name).joinpath("model.onnx").as_posix() + sess_options.add_session_config_entry( + "session.optimized_model_external_initializers_file_name", "model.onnx_data" + ) + sess_options.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "1024") + session = ort.InferenceSession(model_input, sess_options) + + # copy config.json to tmp dir for evaluation, LLMs evaluation may need it + if isinstance(model_input, str) and os.path.exists( + pathlib.Path(model_input).parent.joinpath("config.json").as_posix() + ): + shutil.copyfile( + pathlib.Path(model_input).parent.joinpath("config.json").as_posix(), + pathlib.Path(tmp_folder.name).joinpath("config.json").as_posix(), + ) + + model_input = sess_options.optimized_model_filepath + del session + try: baseline: float = eval_func_wrapper.evaluate(model_input) except Exception as e: - print(e) if "'str' object has no attribute 'SerializeToString'" in str(e): logger.warning("Please refine your eval_fn to accept model path (str) as input.") + if "Unable to load from type ''" in str(e): + logger.warning("Please pass model path to autotune API rather than onnx.ModelProto.") + print(traceback.format_exc()) exit(0) + tuning_monitor.set_baseline(baseline) tuning_logger.tuning_start() for trial_index, quant_config in enumerate(config_loader): + # check whether config_mapping is verified + model_info = quant_config.__class__.get_model_info(model=model_input) + config_mapping = quant_config.to_config_mapping(model_info=model_info) + if tuning_monitor.need_skip(config_mapping): + continue + if calibration_data_reader is not None: calibration_data_reader.rewind() + tuning_logger.trial_start(trial_index=trial_index) tuning_logger.quantization_start() - logger.debug("quant config: {}".format(quant_config)) + tuning_monitor.print_config_diff(quant_config) q_model = _quantize(model_input, quant_config=quant_config, calibration_data_reader=calibration_data_reader) tuning_logger.quantization_end() tuning_logger.evaluation_start() - with tempfile.TemporaryDirectory(prefix="ort.quant.") as tmp_dir: - # evaluate API requires str input - onnx.save_model( - q_model, - pathlib.Path(tmp_dir).joinpath(pathlib.Path(model_input).name).as_posix(), - save_as_external_data=True, - all_tensors_to_one_file=True, - location=pathlib.Path(model_input).with_suffix(pathlib.Path(model_input).suffix + "_data").name, - size_threshold=1024, - convert_attribute=False, - ) - # copy config.json to tmp dir for evaluation, LLMs evaluation may need it - if isinstance(model_input, str) and os.path.exists( - pathlib.Path(model_input).parent.joinpath("config.json").as_posix() - ): - import shutil - - shutil.copyfile( - pathlib.Path(model_input).parent.joinpath("config.json").as_posix(), - pathlib.Path(tmp_dir).joinpath("config.json").as_posix(), - ) - eval_result: float = eval_func_wrapper.evaluate( - 
pathlib.Path(tmp_dir).joinpath(pathlib.Path(model_input).name).as_posix() + # evaluate API requires str input + onnx.save_model( + q_model, + pathlib.Path(tmp_folder.name).joinpath("./eval/model.onnx").as_posix(), + save_as_external_data=True, + all_tensors_to_one_file=True, + size_threshold=1024, + convert_attribute=False, + ) + # copy config.json to tmp dir for evaluation, LLMs evaluation may need it + if isinstance(model_input, str) and os.path.exists( + pathlib.Path(model_input).parent.joinpath("config.json").as_posix() + ): + shutil.copyfile( + pathlib.Path(model_input).parent.joinpath("config.json").as_posix(), + pathlib.Path(tmp_folder.name).joinpath("./eval/config.json").as_posix(), ) + eval_result: float = eval_func_wrapper.evaluate( + pathlib.Path(tmp_folder.name).joinpath("./eval/model.onnx").as_posix() + ) tuning_logger.evaluation_end() logger.info("Evaluation result: %.4f", eval_result) tuning_monitor.add_trial_result(trial_index, eval_result, quant_config) tuning_logger.trial_end(trial_index) if tuning_monitor.need_stop(): - best_quant_config: config.BaseConfig = tuning_monitor.get_best_quant_config() - best_quant_model = _quantize( - model_input, quant_config=best_quant_config, calibration_data_reader=calibration_data_reader + external_data_helper.load_external_data_for_model( + q_model, pathlib.Path(tmp_folder.name).joinpath("./eval").as_posix() ) + best_quant_model = q_model break + tuning_logger.tuning_end() + if best_quant_model is None: + logger.info( + "Don't find the quantized model which meets accuracy requirement. " + "Please try other configs or adjust tolerable_loss." + ) + exit(0) + + tmp_folder.cleanup() return best_quant_model diff --git a/onnx_neural_compressor/utility.py b/onnx_neural_compressor/utility.py index cc36b6e8a..8bea213b5 100644 --- a/onnx_neural_compressor/utility.py +++ b/onnx_neural_compressor/utility.py @@ -22,8 +22,9 @@ import cpuinfo import numpy as np import onnx +import onnxruntime as ort +import prettytable as pt import psutil -from onnxruntime.quantization import onnx_model from onnx_neural_compressor import constants, logger @@ -75,35 +76,20 @@ class Options: This class is used for configuring global variables. The global variable options is created with this class. If you want to change global variables, you should use functions from onnx_neural_compressor.utility.py: set_random_seed(seed: int) - set_workspace(workspace: str) - set_resume_from(resume_from: str) Args: random_seed(int): Random seed used in neural compressor. Default value is 1978. - workspace(str): The directory where intermediate files and tuning history file are stored. - Default value is: - "./nc_workspace/{}/".format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")). - resume_from(str): The directory you want to resume tuning history file from. - The tuning history was automatically saved in the workspace directory - during the last tune process. - Default value is None. 
Example:: from onnx_neural_compressor import set_random_seed - from onnx_neural_compressor import set_workspace - from onnx_neural_compressor import set_resume_from set_random_seed(2022) - set_workspace("workspace_path") - set_resume_from("workspace_path") """ - def __init__(self, random_seed=1978, workspace=constants.DEFAULT_WORKSPACE, resume_from=None): + def __init__(self, random_seed=1978): """Init an Option object.""" self.random_seed = random_seed - self.workspace = workspace - self.resume_from = resume_from @property def random_seed(self): @@ -116,71 +102,10 @@ def random_seed(self, random_seed): if check_value("random_seed", random_seed, int): self._random_seed = random_seed - @property - def workspace(self): - """Get workspace.""" - return self._workspace - - @workspace.setter - def workspace(self, workspace): - """Set workspace.""" - if check_value("workspace", workspace, str): - self._workspace = workspace - - @property - def resume_from(self): - """Get resume_from.""" - return self._resume_from - - @resume_from.setter - def resume_from(self, resume_from): - """Set resume_from.""" - if resume_from is None or check_value("resume_from", resume_from, str): - self._resume_from = resume_from - options = Options() -class TuningLogger: - """A unified logger for the tuning/quantization process. - - It assists validation teams in retrieving logs. - """ - - @classmethod - def tuning_start(cls) -> None: - logger.info("Tuning started.") - - @classmethod - def trial_start(cls, trial_index: int = None) -> None: - logger.info("%d-trail started.", trial_index) - - @classmethod - def quantization_start(cls, stacklevel=2) -> None: - logger.info("Quantization started.", stacklevel=stacklevel) - - @classmethod - def quantization_end(cls, stacklevel=2) -> None: - logger.info("Quantization end.", stacklevel=stacklevel) - - @classmethod - def evaluation_start(cls) -> None: - logger.info("Evaluation started.") - - @classmethod - def evaluation_end(cls) -> None: - logger.info("Evaluation end.") - - @classmethod - def trial_end(cls, trial_index: int = None) -> None: - logger.info("%d-trail end.", trial_index) - - @classmethod - def tuning_end(cls) -> None: - logger.info("Tuning completed.") - - def singleton(cls): """Singleton decorator.""" @@ -195,6 +120,48 @@ def _singleton(*args, **kw): return _singleton +class Statistics: + """The statistics printer.""" + + def __init__(self, data, header, field_names, output_handle=logger.info): + """Init a Statistics object. 
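A small usage sketch of this statistics printer; the table contents are invented purely for illustration:

    from onnx_neural_compressor import utility

    utility.Statistics(
        data=[["MatMul", 20, 18], ["Conv", 6, 6]],
        header="Mixed Precision Statistics",
        field_names=["Op Type", "Total", "INT8"],
    ).print_stat()
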
+ + Args: + data: The statistics data + header: The table header + field_names: The field names + output_handle: The output logging method + """ + self.field_names = field_names + self.header = header + self.data = data + self.output_handle = output_handle + self.tb = pt.PrettyTable(min_table_width=40) + + def print_stat(self): + """Print the statistics.""" + valid_field_names = [] + for index, value in enumerate(self.field_names): + if index < 2: + valid_field_names.append(value) + continue + + if any(i[index] for i in self.data): + valid_field_names.append(value) + self.tb.field_names = valid_field_names + for i in self.data: + tmp_data = [] + for index, value in enumerate(i): + if self.field_names[index] in valid_field_names: + tmp_data.append(value) + if any(tmp_data[1:]): + self.tb.add_row(tmp_data) + lines = self.tb.get_string().split("\n") + self.output_handle("|" + self.header.center(len(lines[0]) - 2, "*") + "|") + for i in lines: + self.output_handle(i) + + class LazyImport(object): """Lazy import python module till use.""" @@ -296,96 +263,11 @@ def get_number_of_sockets(self) -> int: return 0 -def dump_elapsed_time(customized_msg=""): - """Get the elapsed time for decorated functions. - - Args: - customized_msg (string, optional): The parameter passed to decorator. Defaults to None. - """ - - def f(func): - - def fi(*args, **kwargs): - start = time.time() - res = func(*args, **kwargs) - end = time.time() - logger.info( - "%s elapsed time: %s ms" - % (customized_msg if customized_msg else func.__qualname__, round((end - start) * 1000, 2)) - ) - return res - - return fi - - return f - - def set_random_seed(seed: int): """Set the random seed in config.""" options.random_seed = seed -def set_workspace(workspace: str): - """Set the workspace in config.""" - options.workspace = workspace - - -def set_resume_from(resume_from: str): - """Set the resume_from in config.""" - options.resume_from = resume_from - - -def log_quant_execution(func): - default_tuning_logger = TuningLogger() - - def wrapper(*args, **kwargs): - default_tuning_logger.quantization_start(stacklevel=4) - - # Call the original function - result = func(*args, **kwargs) - - default_tuning_logger.quantization_end(stacklevel=4) - return result - - return wrapper - - -dtype_mapping = { - "fp32": 1, - "float32": 1, - "uint8": 2, - "int8": 3, - "uint16": 4, - "int16": 5, - "int32": 6, - "int64": 7, - "string": 8, - "bool": 9, - "fp16": 10, - "float16": 10, - "double": 11, - "uint32": 12, - "uint64": 13, - "complex64": 14, - "complex128": 15, - "bf16": 16, - "bfloat16": 16, -} - - -def find_by_name(name, item_list): - """Helper function to find item by name in a list.""" - items = [] - for item in item_list: - assert hasattr(item, "name"), "{} should have a 'name' attribute defined".format(item) # pragma: no cover - if item.name == name: - items.append(item) - if len(items) > 0: - return items[0] - else: - return None - - def simple_progress_bar(total, i): """Progress bar for cases where tqdm can't be used.""" progress = i / total @@ -419,157 +301,26 @@ def decorator(algo_func): return decorator -def get_model_info( - model: Union[onnx.ModelProto, pathlib.Path, str], white_op_type_list: List[Callable] -) -> List[Tuple[str, Callable]]: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model) - filter_result = [] - filter_result_set = set() - for node in model.graph.node: - if node.op_type in white_op_type_list: - pair = (node.name, node.op_type) - if pair not in filter_result_set: - filter_result_set.add(pair) - 
filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - -def is_B_transposed(node): - """Whether inuput B is transposed.""" - transB = [attr for attr in node.attribute if attr.name == "transB"] - if len(transB): - return 0 < onnx.helper.get_attribute_value(transB[0]) - return False - - -def get_qrange_for_qType(qType, reduce_range=False): - """Helper function to get the quantization range for a type. - - Args: - qType (int): data type - reduce_range (bool, optional): use 7 bit or not. Defaults to False. - """ - if qType == onnx.onnx_pb.TensorProto.UINT8: - return 127 if reduce_range else 255 - elif qType == onnx.onnx_pb.TensorProto.INT8: - # [-64, 64] for reduce_range, and [-127, 127] full_range. - return 128 if reduce_range else 254 +def auto_detect_ep(): + eps = ort.get_available_providers() + if "DnnlExecutionProvider" in eps: + return "DnnlExecutionProvider" + elif "DmlExecutionProvider" in eps: + return "DnnlExecutionProvider" + elif "CUDAExecutionProvider" in eps: + return "CUDAExecutionProvider" else: - raise ValueError("unsupported quantization data type") - + return "CPUExecutionProvider" -def _quantize_data_with_scale_zero(data, qType, scheme, scale, zero_point): - """Quantize data with scale and zero point. - - To pack weights, we compute a linear transformation - - when data type == uint8 mode, from [rmin, rmax] -> [0, 2^{b-1}] and - - when data type == int8, from [-m , m] -> [-(2^{b-1}-1), 2^{b-1}-1] where - m = max(abs(rmin), abs(rmax)) - - Args: - data (np.array): data to quantize - qType (int): data type to quantize to. Supported types UINT8 and INT8 - scheme (string): sym or asym quantization. - scale (float): computed scale of quantized data - zero_point (uint8 or int8): computed zero point of quantized data - """ - data = np.asarray(data) - if qType == onnx.onnx_pb.TensorProto.INT8 and scheme == "sym": - # signed byte type - quantized_data = (data.astype(np.float32) / scale).round().astype("b") - elif qType == onnx.onnx_pb.TensorProto.UINT8 and scheme == "asym": - quantized_data = ((data.astype(np.float32) / scale).round() + zero_point).astype("B") - else: - raise ValueError("Unexpected combination of data type {} and scheme {}.".format(qType, scheme)) - return quantized_data - - -def _calculate_scale_zp(rmin, rmax, quantize_range, qType, scheme): - """Calculate scale and zero point.""" - if isinstance(rmax, np.ndarray): - if scheme == "sym": - max_range = np.maximum(abs(rmin), abs(rmax)) - scale = np.ones(rmax.shape, dtype="float32") - scale[max_range > 0] = np.array( - [float(i) / quantize_range for i in (max_range[max_range > 0] * 2.0).flatten().tolist()], - dtype="float32", - ) - else: - scale = np.ones(rmax.shape, dtype="float32") - scale[rmin != rmax] = np.array( - [float(i) / quantize_range for i in (rmax - rmin)[rmin != rmax].flatten().tolist()], dtype="float32" - ) - - if scheme == "sym" and qType == onnx.onnx_pb.TensorProto.INT8: - zero_point = np.zeros(scale.shape, dtype="int8") if isinstance(scale, np.ndarray) else 0 - elif isinstance(scale, np.ndarray) and (scale == 1).all(): - zero_point = ( - np.zeros(scale.shape, dtype="int8") - if qType == onnx.onnx_pb.TensorProto.INT8 - else np.zeros(scale.shape, dtype="uint8") - ) - elif qType == onnx.onnx_pb.TensorProto.UINT8: - zero_point = np.maximum(0, np.minimum(255, ((0 - float(rmin)) / scale).round()).round()).astype("uint8") - else: - zero_point = ( - (-64 - rmin) / float(scale) if quantize_range == 128 else (-127 - rmin) / float(scale) - ).round() +def 
trt_env_setup(model): + """Set environment variable for Tensorrt Execution Provider.""" + is_int8 = False + for node in model.graph.node: + if node.op_type in ["QuantizeLinear", "DequantizeLinear"]: + is_int8 = True + break + if is_int8: + os.environ["ORT_TENSORRT_INT8_ENABLE"] = "1" else: - if scheme == "sym": - max_range = max(abs(rmin), abs(rmax)) - scale = (float(max_range) * 2) / quantize_range if max_range > 0 else 1 - else: - scale = (float(rmax) - float(rmin)) / quantize_range if rmin != rmax else 1 - - if scale == 1 or (scheme == "sym" and qType == onnx.onnx_pb.TensorProto.INT8): - zero_point = 0 - elif qType == onnx.onnx_pb.TensorProto.UINT8: - zero_point = round((0 - float(rmin)) / scale) - zero_point = np.uint8(round(max(0, min(255, zero_point)))) - else: - zero_point = ( - round((-64 - float(rmin)) / scale) if quantize_range == 128 else round((-127 - float(rmin)) / scale) - ) - return scale, zero_point - - -def quantize_data(data, quantize_range, qType, scheme): - """Quantize data. - - To pack weights, we compute a linear transformation - - when data type == uint8 mode, from [rmin, rmax] -> [0, 2^{b-1}] and - - when data type == int8, from [-m , m] -> [-(2^{b-1}-1), 2^{b-1}-1] where - m = max(abs(rmin), abs(rmax)) - and add necessary intermediate nodes to transform quantized weight to full weight - using the equation r = S(q-z), where - r: real original value - q: quantized value - S: scale - z: zero point - - Args: - data (array): data to quantize - quantize_range (list): list of data to weight pack. - qType (int): data type to quantize to. Supported types UINT8 and INT8 - scheme (string): sym or asym quantization. - """ - rmin = min(min(data), 0) - rmax = max(max(data), 0) - - scale, zero_point = _calculate_scale_zp(rmin, rmax, quantize_range, qType, scheme) - quantized_data = _quantize_data_with_scale_zero(data, qType, scheme, scale, zero_point) - return rmin, rmax, zero_point, scale, quantized_data - - -def check_model_with_infer_shapes(model): - """Check if the model has been shape inferred.""" - if isinstance(model, (pathlib.Path, str)): - model = onnx.load(model, load_external_data=False) - elif isinstance(model, onnx_model.ONNXModel): - model = model.model - if len(model.graph.value_info) > 0: - return True - return False + os.environ["ORT_TENSORRT_INT8_ENABLE"] = "0" diff --git a/onnx_neural_compressor/version.py b/onnx_neural_compressor/version.py index aa0978f16..08d071fc2 100644 --- a/onnx_neural_compressor/version.py +++ b/onnx_neural_compressor/version.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -# # Copyright (c) 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/requirements.txt b/requirements.txt index d02ba0d77..7e4911f78 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,5 @@ psutil py-cpuinfo pydantic transformers +prettytable +scipy diff --git a/test/quantization/layer_wise/test_layer_wise.py b/test/quantization/layer_wise/test_layer_wise.py index af0bca3e4..7988cd3f6 100644 --- a/test/quantization/layer_wise/test_layer_wise.py +++ b/test/quantization/layer_wise/test_layer_wise.py @@ -10,9 +10,9 @@ import transformers from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, data_reader, logger +from onnx_neural_compressor import data_reader, logger from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import matmul_4bits_quantizer +from onnx_neural_compressor.quantization import config, 
matmul_4bits_quantizer def find_onnx_file(folder_path): @@ -134,6 +134,7 @@ def test_rtn_layer_wise_with_ort_like_api(self): quant = matmul_4bits_quantizer.MatMul4BitsQuantizer( copy.deepcopy(self.llama), algo_config=algo_config, + optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL, ) quant.process() qmodel = quant.model @@ -145,6 +146,7 @@ def test_rtn_layer_wise_with_ort_like_api(self): quant = matmul_4bits_quantizer.MatMul4BitsQuantizer( copy.deepcopy(self.llama), algo_config=algo_config, + optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL, ) quant.process() qmodel_lwq = quant.model @@ -183,6 +185,7 @@ def test_gptq_layer_wise_with_ort_like_api(self): quant = matmul_4bits_quantizer.MatMul4BitsQuantizer( copy.deepcopy(self.llama), algo_config=algo_config, + optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL, ) quant.process() qmodel = quant.model @@ -196,6 +199,7 @@ def test_gptq_layer_wise_with_ort_like_api(self): quant = matmul_4bits_quantizer.MatMul4BitsQuantizer( copy.deepcopy(self.llama), algo_config=algo_config, + optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL, ) quant.process() qmodel_lwq = quant.model diff --git a/test/quantization/post_training_quant/test_calibrate.py b/test/quantization/post_training_quant/test_calibrate.py new file mode 100644 index 000000000..a02880d4a --- /dev/null +++ b/test/quantization/post_training_quant/test_calibrate.py @@ -0,0 +1,588 @@ +import os +import shutil +import sys +import unittest + +import numpy as np +import onnx + +from onnx_neural_compressor import data_reader +from onnx_neural_compressor.algorithms.post_training_quant import calibrate, calibrator + + +def generate_input_initializer(tensor_shape, tensor_dtype, input_name): + """Helper function to generate initializers for test inputs.""" + tensor = np.random.ranf(tensor_shape).astype(tensor_dtype) + init = onnx.numpy_helper.from_array(tensor, input_name) + return init + + +class DataReader(data_reader.CalibrationDataReader): + + def __init__(self): + self.data_list = [] + self.data_list.append( + { + "input0": np.array([[[[0.45, 0.60, 0.75]], [[0.25, 0.50, 0.75]], [[0.90, 0.70, 0.50]]]]).astype( + np.float32 + ) + } + ) + self.data_list.append( + { + "input0": np.array([[[[0.62, 0.94, 0.38]], [[0.70, 0.13, 0.07]], [[0.89, 0.75, 0.84]]]]).astype( + np.float32 + ) + } + ) + self.data_list.append( + { + "input0": np.array([[[[0.64, 0.24, 0.97]], [[0.82, 0.58, 0.27]], [[0.019, 0.34, 0.02]]]]).astype( + np.float32 + ) + } + ) + self.enum_data = None + + def get_next(self): + if self.enum_data is None: + self.enum_data = iter(self.data_list) + return next(self.enum_data, None) + + def rewind(self): + self.enum_data = None + + +class DataReader2(data_reader.CalibrationDataReader): + + def __init__(self): + self.data_list = [] + self.data_list.append({"A": np.random.random([1, 1, 5, 5]).astype(np.float32)}) + self.data_list.append({"A": np.random.random([1, 1, 5, 5]).astype(np.float32)}) + self.data_list.append({"A": np.random.random([1, 1, 5, 5]).astype(np.float32)}) + self.enum_data = None + + def get_next(self): + if self.enum_data is None: + self.enum_data = iter(self.data_list) + return next(self.enum_data, None) + + def rewind(self): + self.enum_data = None + + +def create_cv_session(): + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + b_value = np.random.randn(1, 1, 3, 3).astype(np.float32) + B_init = 
onnx.helper.make_tensor("B", onnx.TensorProto.FLOAT, [1, 1, 3, 3], b_value.reshape(9).tolist()) + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + conv_node = onnx.helper.make_node("Conv", ["A", "B"], ["C"], name="conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1]) + relu_node = onnx.helper.make_node("Relu", ["C"], ["D"], name="relu") + graph = onnx.helper.make_graph([conv_node, relu_node], "test_graph_1", [A], [D], [B_init]) + model = onnx.helper.make_model(graph, **{"opset_imports": [onnx.helper.make_opsetid("", 13)]}) + dataloader = DataReader2() + return model, dataloader + + +class TestCalibrate(unittest.TestCase): + work_space = "./onnxrt_calib_test" + + @classmethod + def setUpClass(cls): + os.makedirs(cls.work_space) + cls.cv_session = create_cv_session() + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.work_space, ignore_errors=True) + + def test_dump_calibration(self): + model, dataloader = self.cv_session + augment = calibrate.ONNXRTAugment(model, dataloader, ["Conv", "Relu"], iterations=[0]) + calib_params = augment.dump_calibration({}) + self.assertTrue("A" in calib_params and "B" in calib_params and "D" in calib_params and "C" in calib_params) + + def test_augment_graph(self): + """TEST_CONFIG_1.""" + + # Conv + # | + # Clip + # | + # MatMul + + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + E = onnx.helper.make_tensor_value_info("E", onnx.TensorProto.FLOAT, [1, 1, 5, 1]) + F = onnx.helper.make_tensor_value_info("F", onnx.TensorProto.FLOAT, [1, 1, 5, 1]) + conv_node = onnx.helper.make_node( + "Conv", ["A", "B"], ["C"], name="Conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1] + ) + clip_node = onnx.helper.make_node("Clip", ["C"], ["D"], name="Clip") + matmul_node = onnx.helper.make_node("MatMul", ["D", "E"], ["F"], name="MatMul") + graph = onnx.helper.make_graph([conv_node, clip_node, matmul_node], "test_graph_1", [A, B, E], [F]) + model = onnx.helper.make_model(graph) + + # Augmenting graph + data_reader = None + augment = calibrate.ONNXRTAugment(model, data_reader, ["Conv", "MatMul"]) + augment.augment_graph() + augmented_model = augment.augmented_model + + # Checking if output exists + augmented_model_node_names = [node.name for node in augmented_model.graph.node] + augmented_model_outputs = [output.name for output in augmented_model.graph.output] + added_node_names = ["Conv", "Clip", "MatMul"] + added_outputs = ["A", "B", "C", "D", "E", "F"] + # Original 3 nodes (exclude graph input/output) + self.assertEqual(len(augmented_model_node_names), 3) + # Original 1 graph output + 5 intermediate outputs + self.assertEqual(len(augmented_model_outputs), 6) + for name in added_node_names: + self.assertTrue(name in augmented_model_node_names) + for output in added_outputs: + self.assertTrue(output in augmented_model_outputs) + + print("Finished TEST_CONFIG_1") + """TEST_CONFIG_2.""" + + # Conv + # | + # Conv + + G = onnx.helper.make_tensor_value_info("G", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + H = onnx.helper.make_tensor_value_info("H", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + J = onnx.helper.make_tensor_value_info("J", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + K = onnx.helper.make_tensor_value_info("K", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + conv_node_1 = onnx.helper.make_node( + "Conv", ["G", "H"], ["I"], name="Conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1] + ) + conv_node_2 = onnx.helper.make_node( + "Conv", ["I", "J"], 
["K"], name="Conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1] + ) + graph = onnx.helper.make_graph([conv_node_1, conv_node_2], "test_graph_2", [G, H, J], [K]) + model = onnx.helper.make_model(graph) + + # Augmenting graph + data_reader = None + augment = calibrate.ONNXRTAugment( + model, + data_reader, + ["Conv", "MatMul"], + ) + augment.augment_graph() + augmented_model = augment.augmented_model + + augmented_model_node_names = [node.name for node in augmented_model.graph.node] + augmented_model_outputs = [output.name for output in augmented_model.graph.output] + added_node_names = ["Conv", "Conv"] + added_outputs = ["I", "J", "H", "G", "K"] + # Original 2 nodes + self.assertEqual(len(augmented_model_node_names), 2) + # Original 1 graph output + 4 intermediate outputs + self.assertEqual(len(augmented_model_outputs), 5) + for name in added_node_names: + self.assertTrue(name in augmented_model_node_names) + for output in added_outputs: + self.assertTrue(output in augmented_model_outputs) + + print("Finished TEST_CONFIG_2") + """TEST_CONFIG_3.""" + + # Relu + # | + # Conv \ + # | | + # Clip | + # | / + # MatMul + + L = onnx.helper.make_tensor_value_info("L", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + N = onnx.helper.make_tensor_value_info("N", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + Q = onnx.helper.make_tensor_value_info("Q", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + relu_node = onnx.helper.make_node("Relu", ["L"], ["M"], name="Relu") + conv_node = onnx.helper.make_node( + "Conv", ["M", "N"], ["O"], name="Conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1] + ) + clip_node = onnx.helper.make_node("Clip", ["O"], ["P"], name="Clip") + matmul_node = onnx.helper.make_node("MatMul", ["P", "M"], ["Q"], name="MatMul") + graph = onnx.helper.make_graph([relu_node, conv_node, clip_node, matmul_node], "test_graph_3", [L, N], [Q]) + model = onnx.helper.make_model(graph) + + # Augmenting graph + data_reader = None + augment = calibrate.ONNXRTAugment(model, data_reader, ["Conv", "MatMul"]) + augment.augment_graph() + augmented_model = augment.augmented_model + + augmented_model_node_names = [node.name for node in augmented_model.graph.node] + augmented_model_outputs = [output.name for output in augmented_model.graph.output] + added_node_names = ["Relu", "Conv", "Clip", "MatMul"] + added_outputs = ["P", "M", "N", "O", "Q"] + # Original 4 nodes + self.assertEqual(len(augmented_model_node_names), 4) + # Original 1 graph output + 4 intermediate outputs + self.assertEqual(len(augmented_model_outputs), 5) + for name in added_node_names: + self.assertTrue(name in augmented_model_node_names) + for output in added_outputs: + self.assertTrue(output in augmented_model_outputs) + + print("Finished TEST_CONFIG_3") + """TEST_CONFIG_4.""" + + # Attention + # | + # MatMul + + Attention_weight = onnx.helper.make_tensor_value_info("Attention_weight", onnx.TensorProto.FLOAT, [13, 7]) + Attention_bias = onnx.helper.make_tensor_value_info("Attention_bias", onnx.TensorProto.FLOAT, [13, 7]) + Attention_mask = onnx.helper.make_tensor_value_info("Attention_mask", onnx.TensorProto.INT32, [13, 7]) + S = onnx.helper.make_tensor_value_info("S", onnx.TensorProto.FLOAT, [13, 7]) + T = onnx.helper.make_tensor_value_info("T", onnx.TensorProto.FLOAT, [13, 7]) + attention_node = onnx.helper.make_node( + "Attention", ["Attention_weight", "Attention_bias", "Attention_mask"], ["R"], name="Attention" + ) + matmul_node = onnx.helper.make_node("MatMul", ["R", "S"], ["T"], name="MatMul") + graph = onnx.helper.make_graph( + [attention_node, matmul_node], 
"test_graph_4", [Attention_weight, Attention_bias, Attention_mask, S], [T] + ) + model = onnx.helper.make_model(graph) + + # Augmenting graph + data_reader = None + augment = calibrate.ONNXRTAugment(model, data_reader, ["Conv", "MatMul", "Attention"]) + augment.augment_graph() + augmented_model = augment.augmented_model + + augmented_model_node_names = [node.name for node in augmented_model.graph.node] + augmented_model_outputs = [output.name for output in augmented_model.graph.output] + added_node_names = ["Attention", "MatMul"] + added_outputs = ["R", "Attention_mask", "S", "T", "Attention_bias", "Attention_weight"] + # Original 2 nodes + self.assertEqual(len(augmented_model_node_names), 2) + # Original 1 graph output + 5 intermediate outputs + self.assertEqual(len(augmented_model_outputs), 6) + for name in added_node_names: + self.assertTrue(name in augmented_model_node_names) + for output in added_outputs: + self.assertTrue(output in augmented_model_outputs) + + print("Finished TEST_CONFIG_4") + + # QAttention + # | + # QuantizeLinear + + Attention_input = onnx.helper.make_tensor_value_info("input_quantized", onnx.TensorProto.INT8, [7, 13]) + Attention_weight = onnx.helper.make_tensor_value_info("weight_quantized", onnx.TensorProto.INT8, [13, 7]) + weight_quantized = generate_input_initializer([13, 7], np.int8, "weight_quantized") + Attention_bias = onnx.helper.make_tensor_value_info("bias", onnx.TensorProto.FLOAT, [13, 7]) + bias = generate_input_initializer([13, 7], np.float32, "bias") + Input_scale = onnx.helper.make_tensor_value_info("input_scale", onnx.TensorProto.FLOAT, [1]) + input_scale = generate_input_initializer([1], np.float32, "input_scale") + Weight_scale = onnx.helper.make_tensor_value_info("weight_scale", onnx.TensorProto.FLOAT, [1]) + weight_scale = generate_input_initializer([1], np.float32, "weight_scale") + Attention_mask = onnx.helper.make_tensor_value_info("mask", onnx.TensorProto.INT32, [13, 7]) + mask = generate_input_initializer([13, 7], np.int32, "mask") + Input_zo = onnx.helper.make_tensor_value_info("input_zero_point", onnx.TensorProto.INT8, [1]) + input_zero_point = generate_input_initializer([1], np.int8, "input_zero_point") + Weight_zo = onnx.helper.make_tensor_value_info("weight_zero_point", onnx.TensorProto.INT8, [1]) + weight_zero_point = generate_input_initializer([1], np.int8, "weight_zero_point") + Q_scale = onnx.helper.make_tensor_value_info("attn_output_scale", onnx.TensorProto.FLOAT, [1]) + attn_output_scale = generate_input_initializer([1], np.float32, "attn_output_scale") + Q_zo = onnx.helper.make_tensor_value_info("attn_output_zero_point", onnx.TensorProto.INT8, [1]) + attn_output_zero_point = generate_input_initializer([1], np.int8, "attn_output_zero_point") + Output = onnx.helper.make_tensor_value_info("attn_output_quantized", onnx.TensorProto.INT8, [13, 7]) + attention_node = onnx.helper.make_node( + "QAttention", + [ + "input_quantized", + "weight_quantized", + "bias", + "input_scale", + "weight_scale", + "mask", + "input_zero_point", + "weight_zero_point", + ], + ["attn_output"], + name="attention_quant", + ) + qlinear_node = onnx.helper.make_node( + "QuantizeLinear", + ["attn_output", "attn_output_scale", "attn_output_zero_point"], + ["attn_output_quantized"], + name="attn_output_QuantizeLinear", + ) + graph = onnx.helper.make_graph( + [attention_node, qlinear_node], + "test_graph_5", + [ + Attention_input, + Attention_weight, + Attention_bias, + Input_scale, + Weight_scale, + Attention_mask, + Input_zo, + Weight_zo, + Q_scale, + Q_zo, + 
], + [Output], + ) + graph.initializer.add().CopyFrom(weight_quantized) + graph.initializer.add().CopyFrom(bias) + graph.initializer.add().CopyFrom(input_scale) + graph.initializer.add().CopyFrom(weight_scale) + graph.initializer.add().CopyFrom(mask) + graph.initializer.add().CopyFrom(input_zero_point) + graph.initializer.add().CopyFrom(weight_zero_point) + graph.initializer.add().CopyFrom(attn_output_scale) + graph.initializer.add().CopyFrom(attn_output_zero_point) + model = onnx.helper.make_model(graph) + + # Augmenting graph + data_reader = None + augment = calibrate.ONNXRTAugment(model, data_reader, [], white_nodes=["attention"]) + augment.augment_nodes = ["DequantizeLinear"] + augment.already_quantized = True + + augment.augment_graph() + augmented_model = augment.augmented_model + + augmented_model_node_names = [node.name for node in augmented_model.graph.node] + augmented_model_outputs = [output.name for output in augmented_model.graph.output] + added_node_names = ["attention_quant", "attn_output_QuantizeLinear", "input_quantized_DequantizeLinear"] + added_outputs = ["attn_output_quantized", "input_quantized_output", "attn_output"] + self.assertEqual(len(augmented_model_node_names), 3) + self.assertEqual(len(augmented_model_outputs), 3) + for name in added_node_names: + self.assertTrue(name in augmented_model_node_names) + for output in added_outputs: + self.assertTrue(output in augmented_model_outputs) + + print("Finished TEST_CONFIG_5") + + # QuantizeLinear + # | + # QLinearConv + # | + # DequantizeLinear + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + A_scale = onnx.helper.make_tensor_value_info("A_scale", onnx.TensorProto.FLOAT, [1]) + a_scale = generate_input_initializer([1], np.float32, "A_scale") + A_zo = onnx.helper.make_tensor_value_info("A_zero_point", onnx.TensorProto.INT8, [1]) + a_zero_point = generate_input_initializer([1], np.int8, "A_zero_point") + C = onnx.helper.make_tensor_value_info("C", onnx.TensorProto.INT8, [1, 1, 5, 5]) + c = generate_input_initializer([1, 1, 5, 5], np.int8, "C") + C_scale = onnx.helper.make_tensor_value_info("C_scale", onnx.TensorProto.FLOAT, [1]) + c_scale = generate_input_initializer([1], np.float32, "C_scale") + C_zo = onnx.helper.make_tensor_value_info("C_zero_point", onnx.TensorProto.INT8, [1]) + c_zero_point = generate_input_initializer([1], np.int8, "C_zero_point") + E = onnx.helper.make_tensor_value_info("E", onnx.TensorProto.INT32, [1]) + e = generate_input_initializer([1], np.int32, "E") + D_scale = onnx.helper.make_tensor_value_info("D_scale", onnx.TensorProto.FLOAT, [1]) + d_scale = generate_input_initializer([1], np.float32, "D_scale") + D_zo = onnx.helper.make_tensor_value_info("D_zero_point", onnx.TensorProto.INT8, [1]) + d_zero_point = generate_input_initializer([1], np.int8, "D_zero_point") + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + quantize_node = onnx.helper.make_node( + "QuantizeLinear", ["A", "A_scale", "A_zero_point"], ["A_quantized"], name="A_QuantizeLinear" + ) + conv_node = onnx.helper.make_node( + "QLinearConv", + [ + "A_quantized", + "A_scale", + "A_zero_point", + "C_quantized", + "C_scale", + "C_zero_point", + "D_scale", + "D_zero_point", + "E", + ], + ["D_quantized"], + name="conv_quant", + kernel_shape=[3, 3], + pads=[1, 1, 1, 1], + ) + dequantize_node = onnx.helper.make_node( + "DequantizeLinear", ["D_quantized", "D_scale", "D_zero_point"], ["D"], name="D_DequantizeLinear" + ) + graph = onnx.helper.make_graph( + [quantize_node, 
conv_node, dequantize_node], + "test_graph_5", + [A, A_scale, A_zo, C, C_scale, C_zo, E, D_scale, D_zo], + [D], + ) + graph.initializer.add().CopyFrom(a_scale) + graph.initializer.add().CopyFrom(a_zero_point) + graph.initializer.add().CopyFrom(c) + graph.initializer.add().CopyFrom(c_scale) + graph.initializer.add().CopyFrom(c_zero_point) + graph.initializer.add().CopyFrom(e) + graph.initializer.add().CopyFrom(d_scale) + graph.initializer.add().CopyFrom(d_zero_point) + model = onnx.helper.make_model(graph) + + # Augmenting graph + data_reader = None + augment = calibrate.ONNXRTAugment(model, data_reader, [], white_nodes=["conv"]) + augment.augment_nodes = ["DequantizeLinear"] + augment.already_quantized = True + augment.augment_graph() + augmented_model = augment.augmented_model + + augmented_model_node_names = [node.name for node in augmented_model.graph.node] + augmented_model_outputs = [output.name for output in augmented_model.graph.output] + added_node_names = [ + "A_QuantizeLinear", + "conv_quant", + "D_DequantizeLinear", + "D_quantized_DequantizeLinear", + "A_quantized_DequantizeLinear", + ] + added_outputs = ["D", "D_quantized_output", "A_quantized_output"] + self.assertEqual(len(augmented_model_node_names), 5) + self.assertEqual(len(augmented_model_outputs), 3) + for name in added_node_names: + self.assertTrue(name in augmented_model_node_names) + for output in added_outputs: + self.assertTrue(output in augmented_model_outputs) + + def test_quant_param_calculation(self): + """TEST_CONFIG_6.""" + + # Relu + # | \ + # Conv \ + # | \ + # Relu | + # | Conv + # Conv / + # \ / + # | + # Add + + input0 = onnx.helper.make_tensor_value_info("input0", onnx.TensorProto.FLOAT, [1, 3, 1, 3]) + output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, [1, 3, 1, 3]) + + X1_weight = generate_input_initializer([3, 3, 1, 1], np.float32, "X1_weight") + X1_bias = generate_input_initializer([3], np.float32, "X1_bias") + X3_weight = generate_input_initializer([3, 3, 1, 1], np.float32, "X3_weight") + X3_bias = generate_input_initializer([3], np.float32, "X3_bias") + X5_weight = generate_input_initializer([3, 3, 1, 1], np.float32, "X5_weight") + X5_bias = generate_input_initializer([3], np.float32, "X5_bias") + + relu_node_1 = onnx.helper.make_node("Relu", ["input0"], ["X1"], name="Relu1") + conv_node_1 = onnx.helper.make_node("Conv", ["X1", "X1_weight", "X1_bias"], ["X2"], name="Conv1") + relu_node_2 = onnx.helper.make_node("Relu", ["X2"], ["X3"], name="Relu2") + conv_node_2 = onnx.helper.make_node("Conv", ["X3", "X3_weight", "X3_bias"], ["X4"], name="Conv2") + conv_node_3 = onnx.helper.make_node("Conv", ["X1", "X5_weight", "X5_bias"], ["X5"], name="Conv3") + add_node = onnx.helper.make_node("Add", ["X4", "X5"], ["output"], name="Add") + + graph = onnx.helper.make_graph( + [relu_node_1, conv_node_1, relu_node_2, conv_node_2, conv_node_3, add_node], + "test_graph_5", + [input0], + [output], + ) + graph.initializer.add().CopyFrom(X1_weight) + graph.initializer.add().CopyFrom(X1_bias) + graph.initializer.add().CopyFrom(X3_weight) + graph.initializer.add().CopyFrom(X3_bias) + graph.initializer.add().CopyFrom(X5_weight) + graph.initializer.add().CopyFrom(X5_bias) + model = onnx.helper.make_model(graph, **{"opset_imports": [onnx.helper.make_opsetid("", 13)]}) + data_reader = DataReader() + augment = calibrate.ONNXRTAugment(model, data_reader, ["Conv", "MatMul"]) + + # test calculation of quantization params + data_reader.rewind() + quantization_params_dict = augment.dump_calibration({}) + 
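# re-collect the intermediate outputs so the expected scale/zero-point can be recomputed from the raw min/max thresholds below +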
data_reader.rewind() + node_output_names, output_dicts_list = augment.get_intermediate_outputs({}) + data_reader.rewind() + dict_for_quantization = augment._map_calibration(node_output_names, output_dicts_list) + # check the size of the quantization dictionary + + self.assertEqual(len(quantization_params_dict), 12) + + # check the computation of zp and scale + for key, value in quantization_params_dict.items(): + self.assertTrue(value is not None) + self.assertTrue(len(value) == 2) + + thresholds = dict_for_quantization[key] + rmin = min(thresholds[0], 0) + rmax = max(thresholds[1], 0) + if key == "X2": # next_node is Relu + if rmin < 0: + rmin = 0 + + scale_expected = np.float32((rmax - rmin) / 255 if rmin != rmax else 1) + zp_expected = np.uint8(round(max(0, min(255, (0 - rmin) / scale_expected)))) + zp_actual = value[0] + scale_actual = value[1] + + self.assertAlmostEqual(zp_expected, zp_actual) + self.assertAlmostEqual(scale_expected, scale_actual) + + print("Finished" + " test calculation of quantization params.") + + def test_calibrator(self): + regular_data = [np.arange(15).reshape(3, 5).astype("float32"), np.arange(15).reshape(3, 5).astype("float32")] + irregular_data = [np.arange(10).reshape(2, 5).astype("float32"), np.arange(5).reshape(1, 5).astype("float32")] + + calib = calibrator.CALIBRATOR["MinMax"]() + calib.collect(irregular_data) + res = calib.calib_range + self.assertEqual(res[0], np.array(0.0).astype(np.float32)) + self.assertEqual(res[1], np.array(9.0).astype(np.float32)) + calib.collect(regular_data) + res = calib.calib_range + self.assertEqual(res[0], np.array(0.0).astype(np.float32)) + self.assertEqual(res[1], np.array(14.0).astype(np.float32)) + calib.clear() + res = calib.calib_range + self.assertIsNone(res[0]) + self.assertIsNone(res[1]) + del calib + + calib = calibrator.CALIBRATOR["Entropy"]() + calib.collect(irregular_data) + res = calib.calib_range + self.assertEqual(res[0], np.array(0.0).astype(np.float32)) + self.assertEqual(res[1], np.array(9.0).astype(np.float32)) + calib.collect(regular_data) + res = calib.calib_range + self.assertEqual(res[0], np.array(0.0).astype(np.float32)) + self.assertEqual(res[1], np.array(9.140625).astype(np.float32)) + calib.clear() + res = calib.calib_range + self.assertIsNone(res[0]) + self.assertIsNone(res[1]) + del calib + + calib = calibrator.CALIBRATOR["Percentile"]() + calib.collect(irregular_data) + res = calib.calib_range + self.assertEqual(res[0], np.array(0.0).astype(np.float32)) + self.assertEqual(res[1], np.array(8.991211).astype(np.float32)) + calib.collect(regular_data) + res = calib.calib_range + self.assertEqual(res[0], np.array(0.0).astype(np.float32)) + self.assertEqual(res[1], np.array(13.9921875).astype(np.float32)) + calib.clear() + res = calib.calib_range + self.assertIsNone(res[0]) + self.assertIsNone(res[1]) + del calib + + +if __name__ == "__main__": + unittest.main() diff --git a/test/quantization/post_training_quant/test_operators.py b/test/quantization/post_training_quant/test_operators.py new file mode 100644 index 000000000..45c189328 --- /dev/null +++ b/test/quantization/post_training_quant/test_operators.py @@ -0,0 +1,1957 @@ +import collections +import copy +import os +import shutil +import unittest + +import numpy as np +import onnx +import onnxruntime as ort + +from onnx_neural_compressor import quantization +from onnx_neural_compressor.algorithms.post_training_quant import quantizer + + +def build_model(): + initializers = [] + input = onnx.helper.make_tensor_value_info("input", 
onnx.TensorProto.FLOAT, [1, 3, 15, 15]) + output = onnx.helper.make_tensor_value_info("add_out_2", onnx.TensorProto.FLOAT, [88, 11]) + + add_node = onnx.helper.make_node("Add", ["input", "add_init"], ["add_out"], name="add") + + conv1_weight_initializer = onnx.numpy_helper.from_array( + np.random.randint(-1, 2, [3, 3, 3, 3]).astype(np.float32), name="conv1_weight" + ) + conv1_node = onnx.helper.make_node("Conv", ["add_out", "conv1_weight"], ["conv1_output"], name="conv1") + + conv2_weight_initializer = onnx.numpy_helper.from_array( + np.random.randint(-1, 2, [5, 3, 3, 3]).astype(np.float32), name="conv2_weight" + ) + conv2_node = onnx.helper.make_node("Conv", ["add_out", "conv2_weight"], ["conv2_output"], name="conv2") + + # 1, 8, 13, 13 + concat_node = onnx.helper.make_node( + "Concat", ["conv1_output", "conv2_output"], ["concat_output"], name="Concat", axis=1 + ) + # 1, 8, 11, 11 + avg_args = {"kernel_shape": [3, 3]} + avgpool_node = onnx.helper.make_node( + "AveragePool", ["concat_output"], ["avg_output"], name="AveragePool", **avg_args + ) + reshape_node = onnx.helper.make_node("Reshape", ["avg_output", "shape"], ["reshape_output"], name="Reshape") + + add_node_2 = onnx.helper.make_node("Add", ["reshape_output", "add_init_2"], ["add_out_2"], name="add_2") + + initializers = [conv1_weight_initializer, conv2_weight_initializer] + initializers.append(onnx.numpy_helper.from_array(np.array([88, 11], dtype=np.int64), name="shape")) + initializers.append(onnx.numpy_helper.from_array(np.zeros((1, 3, 15, 15)).astype("float32"), name="add_init")) + initializers.append(onnx.numpy_helper.from_array(np.zeros((88, 11)).astype("float32"), name="add_init_2")) + + graph = onnx.helper.make_graph( + [conv1_node, conv2_node, concat_node, avgpool_node, reshape_node, add_node, add_node_2], + "test", + [input], + [output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + return model + + +class TestQuantizer(unittest.TestCase): + qlinear_backend = "qoperator" + qdq_backend = "qdq" + + q_config = { + "weight_type": 3, + "activation_type": 2, + "per_channel": False, + "weight_sym": True, + "activation_sym": False, + "calibrate_method": "MinMax", + } + + @classmethod + def setUpClass(cls): + os.makedirs("./onnxrt_test") + + @classmethod + def tearDownClass(cls): + shutil.rmtree("./onnxrt_test", ignore_errors=True) + + def qlinear_test(self, model, q_config, quantize_params, quantizable_op_types, **kwargs): + quant = quantizer.StaticQuantizer( + model=copy.deepcopy(model), + q_config=q_config, + quant_format=self.qlinear_backend, + quantization_params=quantize_params, + op_types_to_quantize=quantizable_op_types, + **kwargs, + ) + quant.quantize_model() + assert quant.model.model + return quant.model + + def qdq_test(self, model, q_config, quantize_params, quantizable_op_types, **kwargs): + quant = quantizer.StaticQuantizer( + model=copy.deepcopy(model), + q_config=q_config, + quant_format=self.qdq_backend, + quantization_params=quantize_params, + op_types_to_quantize=quantizable_op_types, + **kwargs, + ) + quant.quantize_model() + assert quant.model.model + return quant.model + + def dynamic_test(self, model, q_config, quantize_params, quantizable_op_types, **kwargs): + quant = quantizer.DynamicQuantizer( + model=copy.deepcopy(model), + q_config=q_config, + quantization_params=quantize_params, + op_types_to_quantize=quantizable_op_types, + **kwargs, + ) + quant.quantize_model() + assert quant.model.model + return quant.model + + def 
test_resize(self): + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [1, 2, 26, 42]) + + conv_weight_arr = np.random.randint(-1, 2, [3, 2, 3, 3]).astype(np.float32) + conv_weight_initializer = onnx.numpy_helper.from_array(conv_weight_arr, name="conv1_weight") + conv_node = onnx.helper.make_node("Conv", ["input", "conv1_weight"], ["conv_output"], name="conv_node") + + initializers = [conv_weight_initializer] + + output_tensor = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, [1, 3, 48, 80]) + resize_inputs = ["conv_output"] # resize_roi_name, resize_scales_name, resize_sizes_name] + resize_attrs = {"coordinate_transformation_mode": "asymmetric", "mode": "nearest", "nearest_mode": "floor"} + resize_node = onnx.helper.make_node("Resize", resize_inputs, ["output"], name="resize_node", **resize_attrs) + resize_roi = [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0] + resize_roi_name = "resize_roi" + resize_roi_initializer = onnx.helper.make_tensor( + resize_roi_name, onnx.TensorProto.FLOAT, [len(resize_roi)], resize_roi + ) + initializers.extend([resize_roi_initializer]) + resize_node.input.extend([resize_roi_name]) + + resize_scales = [1.0, 1.0, 2.0, 2.0] + resize_scales_name = "resize_scales" + resize_scales_initializer = onnx.helper.make_tensor( + resize_scales_name, onnx.TensorProto.FLOAT, [len(resize_scales)], resize_scales + ) + initializers.extend([resize_scales_initializer]) + resize_node.input.extend([resize_scales_name]) + + graph = onnx.helper.make_graph( + [conv_node, resize_node], + "TestOpQuantizerResize_test_model", + [input_tensor], + [output_tensor], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 # use stable onnx ir version + + q_config = {"conv_node": self.q_config, "resize_node": self.q_config} + quantize_params = { + "input": [np.uint8(0), np.float32(10.0)], + "conv1_weight": [np.uint8(0), np.float32(10.0)], + "conv_output": [np.uint8(0), np.float32(10.0)], + "output": [np.uint8(0), np.float32(10.0)], + } + + q_model = self.qlinear_test(model, q_config, quantize_params, ["Resize", "Conv"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + + q_model = self.qdq_test(model, q_config, quantize_params, ["Resize", "Conv"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 3) + + # test opset version 10 + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 10)]) + model.ir_version = 7 # use stable onnx ir version + + q_model = self.qlinear_test(model, q_config, quantize_params, ["Resize", "Conv"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + + q_model = self.qdq_test(model, q_config, quantize_params, ["Resize", "Conv"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + + 
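# ArgMax runs on the quantized Conv output directly, so the QOperator model should keep exactly one QuantizeLinear/DequantizeLinear pair +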
def test_argmax(self): + input_name = "input" + output_name = "output" + input_shape = [1, 256, 128, 128] + output_shape = [1, 32, 128] + initializers = [] + + # make Conv node + conv_weight_name = "conv_weight" + conv_weight_arr = np.random.randint(-1, 2, [32, 256, 1, 1]).astype(np.float32) + conv_weight_initializer = onnx.numpy_helper.from_array(conv_weight_arr, name=conv_weight_name) + conv_output_name = "conv_output" + conv_inputs = [input_name, conv_weight_name] + conv_outputs = [conv_output_name] + conv_name = "conv_node" + conv_node = onnx.helper.make_node( + "Conv", + conv_inputs, + conv_outputs, + dilations=[1, 1], + kernel_shape=[1, 1], + pads=[0, 0, 0, 0], + strides=[1, 1], + name=conv_name, + ) + + # make ArgMax node + argmax_inputs = [conv_output_name] + argmax_outputs = [output_name] + argmax_name = "argmax_node" + argmax_node = onnx.helper.make_node( + "ArgMax", + argmax_inputs, + argmax_outputs, + axis=3, + keepdims=0, + name=argmax_name, + ) + + initializers = [conv_weight_initializer] + + # make graph + input_tensor = onnx.helper.make_tensor_value_info(input_name, onnx.TensorProto.FLOAT, input_shape) + output_tensor = onnx.helper.make_tensor_value_info(output_name, onnx.TensorProto.INT64, output_shape) + graph_name = "ArgMax_Quant_Test" + graph = onnx.helper.make_graph( + [conv_node, argmax_node], + graph_name, + [input_tensor], + [output_tensor], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 # use stable onnx ir version + q_config = {"conv_node": self.q_config, "argmax_node": self.q_config} + quantize_params = { + "input": [np.uint8(0), np.float32(10.0)], + "conv_weight": [np.uint8(0), np.float32(10.0)], + "conv_output": [np.uint8(0), np.float32(10.0)], + "output": [np.uint8(0), np.float32(10.0)], + } + q_model = self.qlinear_test(model, q_config, quantize_params, ["Conv", "ArgMax"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + + def test_gemm(self): + input_name = "input" + output_name = "output" + initializers = [] + weight_shape = [100, 10] + weight_name = "linear1.weight" + bias_shape = [100] + bias_name = "linear1.bias" + node_name = "gemm" + + weight_data = np.random.normal(0, 0.1, weight_shape).astype(np.float32) + initializers.append(onnx.numpy_helper.from_array(weight_data, name=weight_name)) + + bias_data = np.random.normal(0, 0.1, bias_shape).astype(np.float32) + initializers.append(onnx.numpy_helper.from_array(bias_data, name=bias_name)) + + gemm1_node = onnx.helper.make_node( + "Gemm", [input_name, weight_name, bias_name], [output_name], alpha=1.0, beta=1.0, transB=1, name=node_name + ) + + gemm1_output_name = "gemm1_output" + input_tensor = onnx.helper.make_tensor_value_info(input_name, onnx.TensorProto.FLOAT, [-1, 10]) + output_tensor = onnx.helper.make_tensor_value_info(output_name, onnx.TensorProto.FLOAT, [-1, 100]) + graph_name = "gemm_test" + graph = onnx.helper.make_graph( + [gemm1_node], + graph_name, + [input_tensor], + [output_tensor], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 # use stable onnx ir version + q_config = {"gemm": self.q_config} + quantize_params = { + "input": [np.uint8(0), np.float32(10.0)], + "linear1.weight": [np.uint8(0), np.float32(10.0)], + 
"linear1.bias": [np.uint8(0), np.float32(10.0)], + "output": [np.uint8(0), np.float32(10.0)], + } + q_model = self.qlinear_test(model, q_config, quantize_params, ["Gemm"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + q_model = self.qdq_test(model, q_config, quantize_params, ["Gemm"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + + # test gemm with non-constant bias + bias_tensor = onnx.helper.make_tensor_value_info(bias_name, onnx.TensorProto.FLOAT, [100]) + gemm2_node = onnx.helper.make_node( + "Gemm", [input_name, weight_name, bias_name], [output_name], alpha=1.0, beta=1.0, transB=1, name=node_name + ) + initializers = [] + initializers.append(onnx.numpy_helper.from_array(weight_data, name=weight_name)) + graph_name = "gemm_test" + graph = onnx.helper.make_graph( + [gemm2_node], + graph_name, + [input_tensor, bias_tensor], + [output_tensor], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 + q_model = self.qlinear_test(model, q_config, quantize_params, ["Gemm"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 0) + q_model = self.qdq_test(model, q_config, quantize_params, ["Gemm"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + + def test_embed(self): + input_ids_shape = [1, 4] + input_ids_tensor = onnx.helper.make_tensor_value_info("input_ids", onnx.TensorProto.INT32, input_ids_shape) + + segment_ids_shape = [1, 4] + segment_ids_tensor = onnx.helper.make_tensor_value_info( + "segment_ids", onnx.TensorProto.INT32, segment_ids_shape + ) + + # EmbedLayerNormalization Node Constants and Weights: + word_embed_shape = [32, 4] + word_embed_weights = np.random.random_sample(word_embed_shape).astype(dtype="float32") + word_embed_initializer = onnx.numpy_helper.from_array(word_embed_weights, name="word_embed") + + pos_embed_shape = [16, 4] + pos_embed_weights = np.random.random_sample(pos_embed_shape).astype(dtype="float32") + pos_embed_initializer = onnx.numpy_helper.from_array(pos_embed_weights, name="pos_embed") + + seg_embed_shape = [2, 4] + seg_embed_weights = np.random.random_sample(seg_embed_shape).astype(dtype="float32") + seg_embed_initializer = onnx.numpy_helper.from_array(seg_embed_weights, name="seg_embed") + + gamma_shape = [4] + gamma = np.random.random_sample(gamma_shape).astype(dtype="float32") + gamma_initializer = onnx.numpy_helper.from_array(gamma, name="gamma") + + beta_shape = [4] + beta = np.random.random_sample(beta_shape).astype(dtype="float32") + beta_initializer = onnx.numpy_helper.from_array(beta, name="beta") + + # EmbedLayerNormalization Outputs: + layernorm_out_shape = [1, 4, 4] + layernorm_out_tensor = onnx.helper.make_tensor_value_info( + "layernorm_out", onnx.TensorProto.FLOAT, layernorm_out_shape + ) + + 
mask_index_out_shape = [1] + mask_index_out_tensor = onnx.helper.make_tensor_value_info( + "mask_index_out", onnx.TensorProto.INT32, mask_index_out_shape + ) + + # EmbedLayerNormalization Node: + embed_layer_norm_inputs = ["input_ids", "segment_ids", "word_embed", "pos_embed", "seg_embed", "gamma", "beta"] + embed_layer_norm_outputs = ["layernorm_out", "mask_index_out"] + embed_layer_norm_node = onnx.helper.make_node( + "EmbedLayerNormalization", + embed_layer_norm_inputs, + embed_layer_norm_outputs, + domain="com.microsoft", + name="Embed", + ) + + # Construct the Graph and Model: + nodes = [embed_layer_norm_node] + graph_name = "embed_layernorm_graph" + inputs = [input_ids_tensor, segment_ids_tensor] + outputs = [layernorm_out_tensor, mask_index_out_tensor] + initializers = [ + word_embed_initializer, + pos_embed_initializer, + seg_embed_initializer, + gamma_initializer, + beta_initializer, + ] + + graph = onnx.helper.make_graph(nodes, graph_name, inputs, outputs, initializer=initializers) + model = onnx.helper.make_model( + graph, + opset_imports=[onnx.helper.make_opsetid("com.microsoft", 14), onnx.helper.make_opsetid("ai.onnx", 14)], + ) + model.ir_version = 7 # use stable onnx ir version + + q_config = {"Embed": self.q_config} + quantize_params = { + "word_embed": [np.uint8(10.0), np.float32(0)], + "pos_embed": [np.uint8(10.0), np.float32(0)], + "seg_embed": [np.uint8(10.0), np.float32(0)], + "gamma": [np.uint8(10.0), np.float32(0)], + "beta": [np.uint8(10.0), np.float32(0)], + "layernorm_out": [np.uint8(10.0), np.float32(0)], + "mask_index_out": [np.uint8(10.0), np.float32(0)], + "input_ids": [np.uint8(10.0), np.float32(0)], + } + q_model = self.qlinear_test(model, q_config, quantize_params, ["EmbedLayerNormalization"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["QEmbedLayerNormalization"], 1 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, ["EmbedLayerNormalization"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 5 + ) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["EmbedLayerNormalization"], 1 + ) + + def test_LSTM(self): + input_shape = [1, 1, 200] + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, input_shape) + + w_shape = [2, 400, 200] + w_weights = np.random.random_sample(w_shape).astype(dtype="float32") + w_init = onnx.numpy_helper.from_array(w_weights, name="w") + + r_shape = [2, 400, 100] + r_weights = np.random.random_sample(r_shape).astype(dtype="float32") + r_init = onnx.numpy_helper.from_array(r_weights, name="r") + + b_shape = [2, 800] + b_weights = np.random.random_sample(b_shape).astype(dtype="float32") + b_init = onnx.numpy_helper.from_array(b_weights, name="b") + + out_shape = [1, 2, 1, 100] + out_tensor = onnx.helper.make_tensor_value_info("out", onnx.TensorProto.FLOAT, out_shape) + + kwargs = {} + kwargs["direction"] = "bidirectional" + kwargs["activations"] = ["Sigmoid", "Tanh", "Tanh", "Sigmoid", "Tanh", "Tanh"] + kwargs["hidden_size"] = 100 + kwargs["input_forget"] = 0 + + lstm_node = onnx.helper.make_node("LSTM", ["input", "w", "r", "b"], ["out"], name="lstm", domain="", **kwargs) + graph = onnx.helper.make_graph( + [lstm_node], "test", [input_tensor], [out_tensor], initializer=[w_init, r_init, b_init] + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 11)]) + model.ir_version = 7 # use stable onnx ir 
version + + q_config = {"lstm": self.q_config} + q_model = self.dynamic_test(model, q_config, None, ["LSTM"]) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLSTM"], 1 + ) + + def test_concat_reshape_pooling(self): + model = build_model() + + q_config = { + "Reshape": self.q_config, + "conv1": self.q_config, + "conv2": self.q_config, + "Concat": self.q_config, + "AveragePool": self.q_config, + "add": self.q_config, + } + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "conv1_weight": [np.uint8(10.0), np.float32(0)], + "conv1_output": [np.uint8(10.0), np.float32(0)], + "conv2_weight": [np.uint8(10.0), np.float32(0)], + "conv2_output": [np.uint8(10.0), np.float32(0)], + "concat_output": [np.uint8(10.0), np.float32(0)], + "avg_output": [np.uint8(10.0), np.float32(0)], + "add_out": [np.uint8(10.0), np.float32(0)], + "add_init": [np.uint8(10.0), np.float32(0)], + "shape": [np.uint8(10.0), np.float32(0)], + "reshape_output": [np.uint8(10.0), np.float32(0)], + "add_init_2": [np.uint8(10.0), np.float32(0)], + "add_out_2": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["Reshape", "Conv", "Concat", "AveragePool", "Add"] + q_model = self.qlinear_test( + model, q_config, quantize_params, quantizable_op_types, **{"dedicated_qdq_pair": True} + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types, **{"dedicated_qdq_pair": True}) + q_model.save("test.onnx") + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 7) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 9 + ) + + q_config = { + "Reshape": self.q_config, + "conv1": "fp32", + "conv2": self.q_config, + "Concat": self.q_config, + "AveragePool": self.q_config, + } + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3 + ) + + q_config = { + "Reshape": self.q_config, + "conv1": "fp32", + "conv2": "fp32", + "Concat": self.q_config, + "AveragePool": self.q_config, + } + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + 
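# keep AveragePool in fp32 and verify it remains a plain AveragePool node after quantization +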
q_config = { + "Reshape": self.q_config, + "conv1": self.q_config, + "conv2": self.q_config, + "Concat": self.q_config, + "AveragePool": "fp32", + } + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["AveragePool"], 1) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) + + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "conv1_weight": [np.uint8(10.0), np.float32(0)], + "conv1_output": [np.uint8(10.0), np.float32(0)], + "conv2_weight": [np.uint8(10.0), np.float32(0)], + "conv2_output": [np.uint8(10.0), np.float32(0)], + "concat_output": [np.uint8(10.0), np.float32(0)], + "avg_output": [np.uint8(10.0), np.float32(0)], + "shape": [np.uint8(10.0), np.float32(0)], + "add_out": [np.uint8(10.0), np.float32(0)], + "add_init": [np.uint8(10.0), np.float32(0)], + "reshape_output": [np.uint8(10.0), np.float32(0)], + } + q_config = { + "Reshape": self.q_config, + "conv1": self.q_config, + "conv2": self.q_config, + "Concat": self.q_config, + "AveragePool": self.q_config, + } + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["Add"], 2) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 6) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 8 + ) + + def test_conv(self): + for op in ["Conv", "FusedConv"]: + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 5, 5, 1]) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 3, 3, 1]) + C = onnx.helper.make_tensor( + "C", onnx.TensorProto.FLOAT, [1, 5, 5, 1], np.random.random((1, 5, 5, 1)).reshape(25).tolist() + ) + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 1, 5, 1]) + conv_node = onnx.helper.make_node( + op, ["A", "B", "C"], ["D"], name=op, kernel_shape=[3, 3], pads=[1, 1, 1, 1] + ) + initializers = [C] + graph = onnx.helper.make_graph([conv_node], "test_graph_1", [A, B], [D], initializer=initializers) + model = onnx.helper.make_model(graph) + q_config = {op: self.q_config} + quantize_params = { + "A": [np.uint8(10.0), np.float32(0)], + "B": [np.uint8(10.0), np.float32(0)], + "C": [np.uint8(10.0), np.float32(0)], + "D": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["Conv"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2 + ) + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + 
collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 3 + ) + + def test_matmul(self): + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + B_init = onnx.helper.make_tensor( + "B", onnx.TensorProto.FLOAT, [1, 1, 5, 1], np.random.random((1, 1, 5, 1)).reshape(5).tolist() + ) + C = onnx.helper.make_tensor_value_info("C", onnx.TensorProto.FLOAT, [1, 1, 5, 1]) + matmul_node = onnx.helper.make_node("MatMul", ["A", "B"], ["C"], name="Matmul") + graph = onnx.helper.make_graph([matmul_node], "test_graph_1", [A], [C], [B_init]) + model = onnx.helper.make_model(graph) + q_config = {"Matmul": self.q_config} + quantize_params = { + "A": [np.uint8(10.0), np.float32(0)], + "B": [np.uint8(10.0), np.float32(0)], + "C": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + + q_config = {"Matmul": self.q_config} + q_model = self.dynamic_test(model, q_config, None, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["MatMulInteger"], 1) + + quantize_params = {"A": [np.float32(10.0)], "B": [np.float32(10.0)], "C": [np.float32(10.0)]} + with self.assertRaises(ValueError): + self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + with self.assertRaises(ValueError): + self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + + quantize_params = {} + q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["MatMulInteger"], 1) + + def test_attention(self): + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + C = onnx.helper.make_tensor_value_info("C", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + node = onnx.helper.make_node("Attention", ["A", "B", "C"], ["D"], name="Attention") + graph = onnx.helper.make_graph([node], "test_graph_1", [A, B, C], [D]) + model = onnx.helper.make_model(graph) + q_config = {"Attention": self.q_config} + quantize_params = { + "A": [np.uint8(0), np.float32(0.5)], + "B": [np.uint8(0), np.float32(0.5)], + "C": [np.uint8(0), np.float32(0.5)], + "D": [np.uint8(0), np.float32(0.5)], + } + quantizable_op_types = ["Attention"] + + q_model = self.qlinear_test(model, q_config, quantize_params, 
quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QAttention"], 1) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + q_config = {"Attention": self.q_config} + q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLinear"], 2 + ) + + E = onnx.helper.make_tensor_value_info("E", onnx.TensorProto.INT32, [1, 1, 5, 5]) + F = onnx.helper.make_tensor_value_info("F", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + node = onnx.helper.make_node("Attention", ["A", "B", "C", "F", "E"], ["D"], name="Attention") + graph = onnx.helper.make_graph([node], "test_graph_1", [A, B, C, F, E], [D]) + model = onnx.helper.make_model(graph) + q_config = {"Attention": self.q_config} + quantize_params = { + "A": [np.uint8(0), np.float32(0.5)], + "B": [np.uint8(0), np.float32(0.5)], + "C": [np.uint8(0), np.float32(0.5)], + "D": [np.uint8(0), np.float32(0.5)], + } + quantizable_op_types = ["Attention"] + + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2 + ) + + q_config = {"Attention": self.q_config} + q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLinear"], 2 + ) + + def test_gather(self): + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [3, 2]) + + matmul_weight = onnx.helper.make_tensor( + "matmul_weight", onnx.TensorProto.FLOAT, [2, 3], np.random.random((2, 3)).reshape(6).tolist() + ) + matmul_output = onnx.helper.make_tensor_value_info("matmul_output", onnx.TensorProto.FLOAT, [3, 3]) + matmul_node = onnx.helper.make_node("MatMul", ["input", "matmul_weight"], ["matmul_output"], name="MatMul") + + gather_indices = onnx.helper.make_tensor("gather_indices", onnx.TensorProto.INT64, [1, 2], [0, 2]) + gather_output = onnx.helper.make_tensor_value_info("gather_output", onnx.TensorProto.FLOAT, [1, 2, 3]) + gather_node = onnx.helper.make_node( + "Gather", ["matmul_output", "gather_indices"], ["gather_output"], name="Gather" + ) + + initializers = [matmul_weight, gather_indices] + graph = onnx.helper.make_graph( + [matmul_node, gather_node], + "TestGather_test_model", + [input_tensor], + [gather_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 + + q_config = {"Gather": self.q_config, "MatMul": self.q_config} + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul_weight": [np.uint8(10.0), 
np.float32(0)], + "matmul_output": [np.uint8(10.0), np.float32(0)], + "gather_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["Gather", "MatMul"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 3) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) + + q_config = {"Gather": self.q_config, "MatMul": self.q_config} + q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(len(q_model.model.graph.node), 6) + + def test_split(self): + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [100, 2]) + e_value = np.random.randn(2, 2).astype(np.float32) + E_init = onnx.helper.make_tensor("E", onnx.TensorProto.FLOAT, [2, 2], e_value.reshape(4).tolist()) + + matmul_node = onnx.helper.make_node("MatMul", ["D", "E"], ["A"], name="Matmul") + + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [50, 2]) + C = onnx.helper.make_tensor_value_info("C", onnx.TensorProto.FLOAT, [50, 2]) + node = onnx.helper.make_node("Split", ["A"], ["B", "C"], name="Split", **{"num_outputs": 2}) + graph = onnx.helper.make_graph([matmul_node, node], "test_graph_1", [D], [B, C], [E_init]) + model = onnx.helper.make_model(graph) + q_config = { + "Split": self.q_config, + "Matmul": { + "weight_type": 3, + "activation_type": 2, + "per_channel": False, + "weight_sym": True, + "activation_sym": False, + "calibrate_method": quantization.CalibrationMethod.MinMax, + }, + } + quantize_params = { + "A": [np.uint8(0), np.float32(0.5)], + "B": [np.uint8(0), np.float32(0.5)], + "C": [np.uint8(0), np.float32(0.5)], + "D": [np.uint8(0), np.float32(0.5)], + "E": [np.uint8(0), np.float32(0.5)], + } + quantizable_op_types = ["Split", "MatMul"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 5 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + + def test_pad(self): + b_value = np.array([0, 1, 1, 0, 1, 1]).astype(np.int64) + B_init = onnx.helper.make_tensor("B", onnx.TensorProto.INT64, [6], b_value.reshape(6).tolist()) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.INT64, [6]) + C = onnx.helper.make_tensor_value_info("C", onnx.TensorProto.FLOAT, [1, 7, 7]) + + d_value = np.random.randn(1).astype(np.float32) + D_init = onnx.helper.make_tensor("D", onnx.TensorProto.FLOAT, [1], d_value.reshape(1).tolist()) + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1]) + + e_value = np.random.randn(1, 5, 5).astype(np.float32) + E_init = 
onnx.helper.make_tensor("E", onnx.TensorProto.FLOAT, [1, 1, 5, 5], e_value.reshape(25).tolist()) + E = onnx.helper.make_tensor_value_info("E", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + f_value = np.random.randn(1, 3, 3).astype(np.float32) + F_init = onnx.helper.make_tensor("F", onnx.TensorProto.FLOAT, [1, 1, 3, 3], f_value.reshape(9).tolist()) + F = onnx.helper.make_tensor_value_info("F", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + for mode in ["constant", "edge", "reflect", "constant_value"]: + conv_node = onnx.helper.make_node( + "Conv", ["E", "F"], ["A"], name="Conv", kernel=[3, 3], padding=[1, 1, 1, 1] + ) + if mode == "constant_value": + node = onnx.helper.make_node("Pad", ["A", "B", "D"], ["C"], name="Pad", mode="constant") + graph = onnx.helper.make_graph( + [conv_node, node], "test_graph_1", [E, F, B, D], [C], [E_init, F_init, B_init, D_init] + ) + else: + node = onnx.helper.make_node("Pad", ["A", "B"], ["C"], name="Pad", mode=mode) + graph = onnx.helper.make_graph( + [conv_node, node], "test_graph_1", [E, F, B], [C], [E_init, F_init, B_init] + ) + model = onnx.helper.make_model(graph) + conv_config = { + "weight_type": 3, + "activation_type": 2, + "per_channel": True, + "weight_sym": True, + "activation_sym": False, + "calibrate_method": quantization.CalibrationMethod.MinMax, + } + q_config = {"Conv": conv_config, "Pad": self.q_config} + quantize_params = { + "A": [np.uint8(10.0), np.float32(1)], + "C": [np.uint8(10.0), np.float32(1)], + "D": [np.uint8(10.0), np.float32(1)], + "E": [np.uint8(10.0), np.float32(1)], + "F": [np.uint8(10.0), np.float32(1)], + } + quantizable_op_types = ["Conv", "Pad"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + q_model = self.qdq_test( + model, q_config, quantize_params, quantizable_op_types, **{"dedicated_qdq_pair": True} + ) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) + + node = onnx.helper.make_node("Pad", ["E", "B", "D"], ["C"], name="Pad", mode="constant") + graph = onnx.helper.make_graph([node], "test_graph_1", [E, B, D], [C], [E_init, B_init, D_init]) + model = onnx.helper.make_model(graph) + quantize_params = {"C": [np.uint8(10.0), np.float32(0)], "E": [np.uint8(10.0), np.float32(0)]} + quantizable_op_types = ["Pad"] + q_config = {"Pad": self.q_config} + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2 + ) + + def test_binary(self): + for op in ["Mul", "Add"]: + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 10]) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1]) + C = onnx.helper.make_tensor_value_info("C", onnx.TensorProto.FLOAT, [1, 10]) + node = onnx.helper.make_node(op, ["A", "B"], ["C"], name=op) + graph = onnx.helper.make_graph([node], "test_graph_1", [A, B], [C]) + model = onnx.helper.make_model(graph) + q_config = {op: self.q_config} + quantize_params = { + "A": [np.uint8(10.0), np.float32(0)], + "B": [np.uint8(10.0), np.float32(0)], + "C": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = 
[op] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qlinear_test(model, q_config, {}, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qdq_test(model, q_config, {}, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + def test_relu(self): + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + E = onnx.helper.make_tensor_value_info("E", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + F = onnx.helper.make_tensor_value_info("F", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + + conv_node = onnx.helper.make_node( + "Conv", ["A", "B"], ["C"], name="Conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1] + ) + relu_node = onnx.helper.make_node("Relu", ["C"], ["D"], name="Relu") + add_node = onnx.helper.make_node("Add", ["D", "E"], ["F"], name="Add") + graph = onnx.helper.make_graph([conv_node, relu_node], "test_graph_1", [A, B], [D]) + model = onnx.helper.make_model(graph, **{"opset_imports": [onnx.helper.make_opsetid("", 13)]}) + sess_options = ort.SessionOptions() + sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED + sess_options.optimized_model_filepath = "./onnxrt_test/optimized_model.onnx" + session = ort.InferenceSession(model.SerializeToString(), sess_options, providers=ort.get_available_providers()) + tmp_model = onnx.load(sess_options.optimized_model_filepath) + + q_config = {"Conv": self.q_config, "Relu": self.q_config} + quantize_params = { + "A": [np.uint8(10.0), np.float32(0)], + "B": [np.uint8(10.0), np.float32(0)], + "C": [np.uint8(10.0), np.float32(0)], + "D": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["Conv", "Relu"] + q_model = self.qlinear_test(tmp_model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(len(q_model.model.graph.node), 4) + q_model = self.qdq_test(tmp_model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(len(q_model.model.graph.node), 7) + + sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC + session = ort.InferenceSession(model.SerializeToString(), sess_options, providers=ort.get_available_providers()) + tmp_model = onnx.load(sess_options.optimized_model_filepath) + q_model = self.qlinear_test(tmp_model, q_config, quantize_params, quantizable_op_types) + q_model.save("test.onnx") + self.assertEqual(len(q_model.model.graph.node), 5) + q_model = self.qdq_test(tmp_model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(len(q_model.model.graph.node), 8) + + graph = onnx.helper.make_graph([conv_node, relu_node, add_node], "test_graph_2", [A, B, E], [F]) + model = onnx.helper.make_model(graph, **{"opset_imports": [onnx.helper.make_opsetid("", 13)]}) + sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC + session = 
ort.InferenceSession(model.SerializeToString(), sess_options, providers=ort.get_available_providers()) + tmp_model = onnx.load(sess_options.optimized_model_filepath) + q_model = self.qlinear_test(tmp_model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(len(q_model.model.graph.node), 5) + q_model = self.qdq_test(tmp_model, q_config, quantize_params, quantizable_op_types) + self.assertEqual(len(q_model.model.graph.node), 8) + + def test_clip(self): + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + conv_node = onnx.helper.make_node( + "Conv", ["A", "B"], ["C"], name="Conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1] + ) + clip_node = onnx.helper.make_node("Clip", ["C"], ["D"], name="Clip") + graph = onnx.helper.make_graph([conv_node, clip_node], "test_graph_1", [A, B], [D]) + model = onnx.helper.make_model(graph, **{"opset_imports": [onnx.helper.make_opsetid("", 13)]}) + + sess_options = ort.SessionOptions() + sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED + sess_options.optimized_model_filepath = "./onnxrt_test/optimized_model.onnx" + session = ort.InferenceSession(model.SerializeToString(), sess_options, providers=ort.get_available_providers()) + model = onnx.load(sess_options.optimized_model_filepath) + + q_config = {"Conv": self.q_config, "Clip": self.q_config} + quantize_params = { + "A": [np.uint8(10.0), np.float32(0)], + "B": [np.uint8(10.0), np.float32(0)], + "C": [np.uint8(10.0), np.float32(0)], + "D": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["Conv", "Clip"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 3) + + def test_activation(self): + for op in ["LeakyRelu", "Sigmoid"]: + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 10]) + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 10]) + node = onnx.helper.make_node(op, ["A"], ["B"], name=op) + graph = onnx.helper.make_graph([node], "test_graph_1", [A], [B]) + model = onnx.helper.make_model(graph) + q_config = {op: self.q_config} + quantize_params = {"A": [np.uint8(10.0), np.float32(0)], "B": [np.uint8(10.0), np.float32(0)]} + quantizable_op_types = [op] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2 + ) + + a_value = np.random.randn(1, 10).astype(np.float32) + A_init = onnx.helper.make_tensor("A", onnx.TensorProto.FLOAT, [1, 10], a_value.reshape(10).tolist()) + graph = 
onnx.helper.make_graph([node], "test_graph_1", [A], [B], [A_init]) + model = onnx.helper.make_model(graph) + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2 + ) + + q_model = self.qlinear_test(model, q_config, {}, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qdq_test(model, q_config, {}, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + for op in ["Relu"]: + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 10]) + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 10]) + node = onnx.helper.make_node(op, ["A"], ["B"], name=op) + graph = onnx.helper.make_graph([node], "test_graph_1", [A], [B]) + model = onnx.helper.make_model(graph) + q_config = {op: self.q_config} + quantize_params = {"A": [np.uint8(10.0), np.float32(0)], "B": [np.uint8(10.0), np.float32(0)]} + quantizable_op_types = [op] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + a_value = np.random.randn(1, 10).astype(np.float32) + A_init = onnx.helper.make_tensor("A", onnx.TensorProto.FLOAT, [1, 10], a_value.reshape(10).tolist()) + graph = onnx.helper.make_graph([node], "test_graph_1", [A], [B], [A_init]) + model = onnx.helper.make_model(graph) + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qlinear_test(model, q_config, {}, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + q_model = self.qdq_test(model, q_config, {}, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) + + def test_pooling(self): + op = "MaxPool" + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 5, 5, 1]) + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 5, 5, 1]) + node = onnx.helper.make_node(op, ["A"], ["B"], name=op, kernel_shape=[3, 3], pads=[1, 1, 1, 1]) + graph = onnx.helper.make_graph([node], "test_graph_1", [A], [B]) + q_config = {op: self.q_config} + quantize_params = {"A": [np.uint8(10.0), np.float32(0)], "B": [np.uint8(10.0), np.float32(0)]} + quantizable_op_types = [op] + for opset_version in [12, 13]: + opset = onnx.OperatorSetIdProto() + opset.version = opset_version + 
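# rebuild the standalone MaxPool model for this opset and check that both QOperator and QDQ quantization run without error +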
model = onnx.helper.make_model(graph, opset_imports=[opset]) + self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + conv_node = onnx.helper.make_node( + "Conv", ["A", "B"], ["C"], name="Conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1] + ) + pool_node = onnx.helper.make_node(op, ["C"], ["D"], name=op) + graph = onnx.helper.make_graph([conv_node, pool_node], "test_graph_1", [A, B], [D]) + model = onnx.helper.make_model(graph) + + q_config = {"Conv": self.q_config, op: self.q_config} + quantize_params = { + "A": [np.uint8(10.0), np.float32(0)], + "B": [np.uint8(10.0), np.float32(0)], + "C": [np.uint8(10.0), np.float32(0)], + "D": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["Conv", op] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + + op = "GlobalAveragePool" + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 5, 1, 1]) + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 5, 5, 1]) + node = onnx.helper.make_node(op, ["A"], ["B"], name=op, kernel_shape=[3, 3], pads=[1, 1, 1, 1]) + graph = onnx.helper.make_graph([node], "test_graph_1", [A], [B]) + q_config = {op: self.q_config} + quantize_params = {"A": [np.uint8(10.0), np.float32(0)], "B": [np.uint8(10.0), np.float32(0)]} + quantizable_op_types = [op] + for opset_version in [12, 13]: + opset = onnx.OperatorSetIdProto() + opset.version = opset_version + model = onnx.helper.make_model(graph, opset_imports=[opset]) + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1 + ) + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2 + ) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2 + ) + + A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) + B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) + D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 1, 1, 1]) + conv_node = onnx.helper.make_node( + "Conv", ["A", "B"], ["C"], name="Conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1] + ) + pool_node = onnx.helper.make_node(op, ["C"], ["D"], name=op) + graph = onnx.helper.make_graph([conv_node, pool_node], "test_graph_1", [A, B], 
[D])
+        model = onnx.helper.make_model(graph)
+
+        q_config = {"Conv": self.q_config, op: self.q_config}
+        quantize_params = {
+            "A": [np.uint8(10.0), np.float32(0)],
+            "B": [np.uint8(10.0), np.float32(0)],
+            "C": [np.uint8(10.0), np.float32(0)],
+            "D": [np.uint8(10.0), np.float32(0)],
+        }
+        quantizable_op_types = ["Conv", op]
+        q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types)
+        self.assertEqual(
+            collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1
+        )
+        self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2)
+
+        q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types)
+        self.assertEqual(
+            collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4
+        )
+        self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4)
+
+    def test_exclude_node(self):
+        A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 5, 5, 1])
+        B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [3, 3, 1, 1])
+        D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 1, 3, 3])
+        conv_node = onnx.helper.make_node(
+            "Conv", ["A", "B"], ["C"], name="Conv", kernel_shape=[3, 3], pads=[1, 1, 1, 1]
+        )
+        pool_node = onnx.helper.make_node("MaxPool", ["C"], ["D"], name="MaxPool")
+        graph = onnx.helper.make_graph([conv_node, pool_node], "test_graph_1", [A, B], [D])
+        model = onnx.helper.make_model(graph)
+
+        q_config = {"Conv": self.q_config, "MaxPool": "fp32"}
+        quantize_params = {
+            "A": [np.uint8(10.0), np.float32(0)],
+            "B": [np.uint8(10.0), np.float32(0)],
+            "C": [np.uint8(10.0), np.float32(0)],
+            "D": [np.uint8(10.0), np.float32(0)],
+        }
+        quantizable_op_types = ["Conv", "MaxPool"]
+        q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types)
+        q_model.save("int8.onnx")
+        self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2)
+
+        q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types)
+        self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 3)
+
+    def test_more_direct8bit_nodes(self):
+        # test direct q8 nodes: MatMul-Flatten-Abs-Sign-Shrink-MatMul
+        input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [1, 32])
+
+        matmul1_weight = onnx.helper.make_tensor(
+            "matmul1_weight", onnx.TensorProto.FLOAT, [32, 64], np.random.random((32, 64)).reshape(2048).tolist()
+        )
+        matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [1, 64])
+        matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0")
+
+        flatten_output = onnx.helper.make_tensor_value_info("flatten_output", onnx.TensorProto.FLOAT, [1, 64])
+        flatten_node = onnx.helper.make_node(
+            "Flatten", inputs=["matmul1_output"], outputs=["flatten_output"], axis=1, name="Flatten_1"
+        )
+
+        abs_output = onnx.helper.make_tensor_value_info("abs_output", onnx.TensorProto.FLOAT, [1, 64])
+        abs_node = onnx.helper.make_node("Abs", inputs=["flatten_output"], outputs=["abs_output"], name="Abs_2")
+
+        sign_output = onnx.helper.make_tensor_value_info("sign_output", onnx.TensorProto.FLOAT, [1, 64])
+        sign_node = onnx.helper.make_node("Sign", inputs=["abs_output"], outputs=["sign_output"], name="Sign_3")
+
+        shrink_output =
onnx.helper.make_tensor_value_info("shrink_output", onnx.TensorProto.FLOAT, [1, 64]) + shrink_node = onnx.helper.make_node( + "Shrink", inputs=["sign_output"], outputs=["shrink_output"], name="Shrink_4" + ) + + matmul2_weight = onnx.helper.make_tensor( + "matmul2_weight", onnx.TensorProto.FLOAT, [64, 2], np.random.random((64, 2)).reshape(128).tolist() + ) + matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [1, 2]) + matmul2_node = onnx.helper.make_node( + "MatMul", ["shrink_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_5" + ) + + initializers = [matmul1_weight, matmul2_weight] + graph = onnx.helper.make_graph( + [matmul1_node, flatten_node, abs_node, sign_node, shrink_node, matmul2_node], + "TestMoreDirect8_test_model", + [input_tensor], + [matmul2_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 + + q_config = { + "Matmul_0": self.q_config, + "Flatten_1": self.q_config, + "Abs_2": self.q_config, + "Sign_3": self.q_config, + "Shrink_4": self.q_config, + "Matmul_5": self.q_config, + } + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul1_weight": [np.uint8(10.0), np.float32(0)], + "matmul1_output": [np.uint8(10.0), np.float32(0)], + "flatten_output": [np.uint8(10.0), np.float32(0)], + "abs_output": [np.uint8(10.0), np.float32(0)], + "sign_output": [np.uint8(10.0), np.float32(0)], + "shrink_output": [np.uint8(10.0), np.float32(0)], + "matmul2_weight": [np.uint8(10.0), np.float32(0)], + "matmul2_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul", "Flatten", "Abs", "Sign", "Shrink"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + q_model.save("qdq.onnx") + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 9 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 7) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + def test_expand(self): + # test expand nodes: MatMul-Expand-MatMul + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [3, 2]) + + matmul1_weight = onnx.helper.make_tensor( + "matmul1_weight", onnx.TensorProto.FLOAT, [2, 1], np.random.random((2, 1)).reshape(2).tolist() + ) + matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [3, 1]) + matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0") + + expand_new_shape = onnx.helper.make_tensor("expand_new_shape", onnx.TensorProto.INT64, [2], [3, 4]) + expand_output = onnx.helper.make_tensor_value_info("expand_output", onnx.TensorProto.FLOAT, [3, 4]) + expand_node = onnx.helper.make_node( + "Expand", ["matmul1_output", "expand_new_shape"], ["expand_output"], name="Expand_1" + ) + + matmul2_weight = 
onnx.helper.make_tensor( + "matmul2_weight", onnx.TensorProto.FLOAT, [4, 2], np.random.random((4, 2)).reshape(8).tolist() + ) + matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [3, 2]) + matmul2_node = onnx.helper.make_node( + "MatMul", ["expand_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2" + ) + + initializers = [matmul1_weight, matmul2_weight, expand_new_shape] + graph = onnx.helper.make_graph( + [matmul1_node, expand_node, matmul2_node], + "TestExpand_test_model", + [input_tensor], + [matmul2_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 + + q_config = { + "Matmul_0": self.q_config, + "Expand_1": self.q_config, + "Matmul_2": self.q_config, + } + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul1_weight": [np.uint8(10.0), np.float32(0)], + "matmul1_output": [np.uint8(10.0), np.float32(0)], + "matmul2_weight": [np.uint8(10.0), np.float32(0)], + "matmul2_output": [np.uint8(10.0), np.float32(0)], + "expand_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul", "Expand"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + def test_slice(self): + # test slice nodes: MatMul-Slice-MatMul + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [5, 4, 1]) + + matmul1_weight = onnx.helper.make_tensor( + "matmul1_weight", onnx.TensorProto.FLOAT, [1, 3], np.random.random((1, 3)).reshape(3).tolist() + ) + matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [5, 4, 3]) + matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0") + + slice_starts = onnx.helper.make_tensor("slice_starts", onnx.TensorProto.INT64, [2], [0, 0]) + slice_ends = onnx.helper.make_tensor("slice_ends", onnx.TensorProto.INT64, [2], [3, 4]) + slice_axes = onnx.helper.make_tensor("slice_axes", onnx.TensorProto.INT64, [2], [0, 1]) + slice_steps = onnx.helper.make_tensor("slice_steps", onnx.TensorProto.INT64, [2], [1, 1]) + slice_output = onnx.helper.make_tensor_value_info("slice_output", onnx.TensorProto.FLOAT, [3, 4, 3]) + slice_node = onnx.helper.make_node( + "Slice", + ["matmul1_output", "slice_starts", "slice_ends", "slice_axes", "slice_steps"], + ["slice_output"], + name="Slice_1", + ) + + matmul2_weight = onnx.helper.make_tensor( + "matmul2_weight", onnx.TensorProto.FLOAT, [3, 2], np.random.random((3, 2)).reshape(6).tolist() + ) + matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [3, 4, 2]) + 
matmul2_node = onnx.helper.make_node(
+            "MatMul", ["slice_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2"
+        )
+
+        initializers = [matmul1_weight, matmul2_weight, slice_starts, slice_ends, slice_axes, slice_steps]
+        graph = onnx.helper.make_graph(
+            [matmul1_node, slice_node, matmul2_node],
+            "TestSlice_test_model",
+            [input_tensor],
+            [matmul2_output],
+            initializer=initializers,
+        )
+        model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)])
+        model.ir_version = 7
+
+        q_config = {"Matmul_0": self.q_config, "Slice_1": self.q_config, "Matmul_2": self.q_config}
+        quantize_params = {
+            "input": [np.uint8(10.0), np.float32(0)],
+            "matmul1_weight": [np.uint8(10.0), np.float32(0)],
+            "matmul1_output": [np.uint8(10.0), np.float32(0)],
+            "matmul2_weight": [np.uint8(10.0), np.float32(0)],
+            "matmul2_output": [np.uint8(10.0), np.float32(0)],
+            "slice_output": [np.uint8(10.0), np.float32(0)],
+        }
+        quantizable_op_types = ["MatMul", "Slice"]
+        q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types)
+        self.assertEqual(
+            collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1
+        )
+        self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1)
+        session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"])
+        self.assertIsNotNone(session)
+
+        q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types)
+        self.assertEqual(
+            collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6
+        )
+        self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4)
+        session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"])
+        self.assertIsNotNone(session)
+
+    def test_mod(self):
+        # test Mod nodes: MatMul-Mod-MatMul
+        #                 MatMul-/
+        input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [2, 3])
+
+        matmul1_weight = onnx.helper.make_tensor(
+            "matmul1_weight", onnx.TensorProto.FLOAT, [3, 4], np.random.random((3, 4)).reshape(12).tolist()
+        )
+        matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [2, 4])
+        matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0")
+
+        matmul2_weight = onnx.helper.make_tensor(
+            "matmul2_weight", onnx.TensorProto.FLOAT, [3, 4], np.random.random((3, 4)).reshape(12).tolist()
+        )
+        matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [2, 4])
+        matmul2_node = onnx.helper.make_node("MatMul", ["input", "matmul2_weight"], ["matmul2_output"], name="Matmul_1")
+
+        mod_output = onnx.helper.make_tensor_value_info("mod_output", onnx.TensorProto.FLOAT, [2, 4])
+        mod_node = onnx.helper.make_node("Mod", ["matmul1_output", "matmul2_output"], ["mod_output"], name="Mod_2")
+
+        matmul3_weight = onnx.helper.make_tensor(
+            "matmul3_weight", onnx.TensorProto.FLOAT, [4, 2], np.random.random((4, 2)).reshape(8).tolist()
+        )
+        matmul3_output = onnx.helper.make_tensor_value_info("matmul3_output", onnx.TensorProto.FLOAT, [2, 2])
+        matmul3_node = onnx.helper.make_node(
+            "MatMul", ["mod_output", "matmul3_weight"], ["matmul3_output"], name="Matmul_3"
+        )
+
+        initializers = [matmul1_weight, matmul2_weight, matmul3_weight]
+        graph = onnx.helper.make_graph(
+            [matmul1_node, matmul2_node, mod_node, matmul3_node],
+
"TestMod_test_model", + [input_tensor], + [matmul3_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 14)]) + model.ir_version = 7 + + q_config = { + "Matmul_0": self.q_config, + "Matmul_1": self.q_config, + "Mod_2": self.q_config, + "Matmul_3": self.q_config, + } + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul1_weight": [np.uint8(10.0), np.float32(0)], + "matmul1_output": [np.uint8(10.0), np.float32(0)], + "matmul2_weight": [np.uint8(10.0), np.float32(0)], + "matmul2_output": [np.uint8(10.0), np.float32(0)], + "mod_output": [np.uint8(10.0), np.float32(0)], + "matmul3_weight": [np.uint8(10.0), np.float32(0)], + "matmul3_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul", "Mod"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + q_model.save("test.onnx") + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 8 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 5) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + def test_reducemin_reducemax(self): + # MatMul-ReduceMin-MatMul + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [3, 2, 3]) + + matmul1_weight = onnx.helper.make_tensor( + "matmul1_weight", onnx.TensorProto.FLOAT, [3, 2], np.random.random((3, 2)).reshape(6).tolist() + ) + matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [3, 2, 2]) + matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0") + + reducemin_output = onnx.helper.make_tensor_value_info("reducemin_output", onnx.TensorProto.FLOAT, [3, 1, 2]) + reducemin_node = onnx.helper.make_node( + "ReduceMin", + inputs=["matmul1_output"], + outputs=["reducemin_output"], + axes=[1], + keepdims=1, + name="Reducemin_1", + ) + + matmul2_weight = onnx.helper.make_tensor( + "matmul2_weight", onnx.TensorProto.FLOAT, [2, 3], np.random.random((2, 3)).reshape(6).tolist() + ) + matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [3, 1, 3]) + matmul2_node = onnx.helper.make_node( + "MatMul", ["reducemin_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2" + ) + + initializers = [matmul1_weight, matmul2_weight] + graph = onnx.helper.make_graph( + [matmul1_node, reducemin_node, matmul2_node], + "TestReduceMin_test_model", + [input_tensor], + [matmul2_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 + + q_config = { + "Matmul_0": self.q_config, + "Reducemin_1": self.q_config, + "Matmul_2": self.q_config, + } + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul1_weight": [np.uint8(10.0), np.float32(0)], + 
"matmul1_output": [np.uint8(10.0), np.float32(0)], + "reducemin_output": [np.uint8(10.0), np.float32(0)], + "matmul2_weight": [np.uint8(10.0), np.float32(0)], + "matmul2_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul", "ReduceMin"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + # MatMul-ReduceMax-MatMul + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [3, 2, 3]) + + matmul1_weight = onnx.helper.make_tensor( + "matmul1_weight", onnx.TensorProto.FLOAT, [3, 2], np.random.random((3, 2)).reshape(6).tolist() + ) + matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [3, 2, 2]) + matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0") + + reducemax_output = onnx.helper.make_tensor_value_info("reducemax_output", onnx.TensorProto.FLOAT, [3, 1, 2]) + reducemax_node = onnx.helper.make_node( + "ReduceMax", + inputs=["matmul1_output"], + outputs=["reducemax_output"], + axes=[1], + keepdims=1, + name="Reducemax_1", + ) + + matmul2_weight = onnx.helper.make_tensor( + "matmul2_weight", onnx.TensorProto.FLOAT, [2, 3], np.random.random((2, 3)).reshape(6).tolist() + ) + matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [3, 1, 3]) + matmul2_node = onnx.helper.make_node( + "MatMul", ["reducemax_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2" + ) + + initializers = [matmul1_weight, matmul2_weight] + graph = onnx.helper.make_graph( + [matmul1_node, reducemax_node, matmul2_node], + "TestReduceMax_test_model", + [input_tensor], + [matmul2_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 + + q_config = { + "Matmul_0": self.q_config, + "Reducemax_1": self.q_config, + "Matmul_2": self.q_config, + } + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul1_weight": [np.uint8(10.0), np.float32(0)], + "matmul1_output": [np.uint8(10.0), np.float32(0)], + "reducemax_output": [np.uint8(10.0), np.float32(0)], + "matmul2_weight": [np.uint8(10.0), np.float32(0)], + "matmul2_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul", "ReduceMax"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + session = 
ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + def test_tile(self): + # test Tile nodes: MatMul-Tile-MatMul + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [2, 3, 4, 1]) + + matmul1_weight = onnx.helper.make_tensor( + "matmul1_weight", onnx.TensorProto.FLOAT, [1, 5], np.random.random((1, 5)).reshape(5).tolist() + ) + matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [2, 3, 4, 5]) + matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0") + + repeats = onnx.helper.make_tensor("repeats", onnx.TensorProto.INT64, [4], [2, 2, 2, 2]) + tile_output = onnx.helper.make_tensor_value_info("tile_output", onnx.TensorProto.FLOAT, [4, 6, 8, 10]) + tile_node = onnx.helper.make_node( + "Tile", + ["matmul1_output", "repeats"], + ["tile_output"], + name="Tile_1", + ) + + matmul2_weight = onnx.helper.make_tensor( + "matmul2_weight", onnx.TensorProto.FLOAT, [10, 1], np.random.random((10, 1)).reshape(10).tolist() + ) + matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [4, 6, 8, 1]) + matmul2_node = onnx.helper.make_node( + "MatMul", ["tile_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2" + ) + + initializers = [matmul1_weight, matmul2_weight, repeats] + graph = onnx.helper.make_graph( + [matmul1_node, tile_node, matmul2_node], + "TestTile_test_model", + [input_tensor], + [matmul2_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 + + q_config = {"Matmul_0": self.q_config, "Tile_1": self.q_config, "Matmul_2": self.q_config} + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul1_weight": [np.uint8(10.0), np.float32(0)], + "matmul1_output": [np.uint8(10.0), np.float32(0)], + "matmul2_weight": [np.uint8(10.0), np.float32(0)], + "matmul2_output": [np.uint8(10.0), np.float32(0)], + "tile_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul", "Tile"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + 
def test_centercroppad(self): + # test CenterCropPad nodes: MatMul-CenterCropPad-MatMul + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [20, 10, 1]) + + matmul1_weight = onnx.helper.make_tensor( + "matmul1_weight", onnx.TensorProto.FLOAT, [1, 3], np.random.random((1, 3)).reshape(3).tolist() + ) + matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [20, 10, 3]) + matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0") + + centercroppad_output = onnx.helper.make_tensor_value_info( + "centercroppad_output", onnx.TensorProto.FLOAT, [10, 7, 3] + ) + shape = onnx.helper.make_tensor("shape", onnx.TensorProto.INT64, [3], [10, 7, 3]) + centercroppad_node = onnx.helper.make_node( + "CenterCropPad", + ["matmul1_output", "shape"], + ["centercroppad_output"], + name="Centercroppad_1", + ) + + matmul2_weight = onnx.helper.make_tensor( + "matmul2_weight", onnx.TensorProto.FLOAT, [3, 1], np.random.random((3, 1)).reshape(3).tolist() + ) + matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [10, 7, 1]) + matmul2_node = onnx.helper.make_node( + "MatMul", ["centercroppad_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2" + ) + + initializers = [matmul1_weight, shape, matmul2_weight] + graph = onnx.helper.make_graph( + [matmul1_node, centercroppad_node, matmul2_node], + "TestCenterCropPad_test_model", + [input_tensor], + [matmul2_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 18)]) + model.ir_version = 8 + + q_config = { + "Matmul_0": self.q_config, + "Centercroppad_1": self.q_config, + "Matmul_2": self.q_config, + } + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul1_weight": [np.uint8(10.0), np.float32(0)], + "matmul1_output": [np.uint8(10.0), np.float32(0)], + "matmul2_weight": [np.uint8(10.0), np.float32(0)], + "matmul2_output": [np.uint8(10.0), np.float32(0)], + "centercroppad_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul", "CenterCropPad"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + def test_gathernd(self): + # test GatherND nodes: MatMul-GatherND-MatMul + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [2, 2, 1]) + + matmul1_weight = onnx.helper.make_tensor( + "matmul1_weight", onnx.TensorProto.FLOAT, [1, 2], np.random.random((1, 2)).reshape(2).tolist() + ) + matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [2, 2, 2]) + matmul1_node = 
onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0") + + gathernd_output = onnx.helper.make_tensor_value_info("gathernd_output", onnx.TensorProto.FLOAT, [2, 1, 2]) + indices = onnx.helper.make_tensor("indices", onnx.TensorProto.INT64, [2, 1, 2], [0, 1, 1, 0]) + gathernd_node = onnx.helper.make_node( + "GatherND", + ["matmul1_output", "indices"], + ["gathernd_output"], + name="Gathernd_1", + ) + + matmul2_weight = onnx.helper.make_tensor( + "matmul2_weight", onnx.TensorProto.FLOAT, [2, 1], np.random.random((2, 1)).reshape(2).tolist() + ) + matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [2, 1, 1]) + matmul2_node = onnx.helper.make_node( + "MatMul", ["gathernd_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2" + ) + + initializers = [matmul1_weight, indices, matmul2_weight] + graph = onnx.helper.make_graph( + [matmul1_node, gathernd_node, matmul2_node], + "TestGatherND_test_model", + [input_tensor], + [matmul2_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 + + q_config = { + "Matmul_0": self.q_config, + "Matmul_2": self.q_config, + "Gathernd_1": self.q_config, + } + + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul1_weight": [np.uint8(10.0), np.float32(0)], + "matmul1_output": [np.uint8(10.0), np.float32(0)], + "matmul2_weight": [np.uint8(10.0), np.float32(0)], + "matmul2_output": [np.uint8(10.0), np.float32(0)], + "gathernd_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul", "GatherND"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + def test_gatherelements(self): + # test GatherElements nodes: MatMul-GatherElements-MatMul + input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [3, 1]) + + matmul1_weight = onnx.helper.make_tensor( + "matmul1_weight", onnx.TensorProto.FLOAT, [1, 3], np.random.random((1, 3)).reshape(3).tolist() + ) + matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [3, 3]) + matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0") + + gatherelements_output = onnx.helper.make_tensor_value_info( + "gatherelements_output", onnx.TensorProto.FLOAT, [2, 3] + ) + indices = onnx.helper.make_tensor("indices", onnx.TensorProto.INT64, [2, 3], [-1, -2, 0, -2, 0, 0]) + gathernd_node = onnx.helper.make_node( + "GatherElements", + ["matmul1_output", "indices"], + ["gatherelements_output"], + name="Gatherelements_1", + ) + + matmul2_weight = 
onnx.helper.make_tensor( + "matmul2_weight", onnx.TensorProto.FLOAT, [3, 1], np.random.random((3, 1)).reshape(3).tolist() + ) + matmul2_output = onnx.helper.make_tensor_value_info("matmul2_output", onnx.TensorProto.FLOAT, [2, 1]) + matmul2_node = onnx.helper.make_node( + "MatMul", ["gatherelements_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2" + ) + + initializers = [matmul1_weight, indices, matmul2_weight] + graph = onnx.helper.make_graph( + [matmul1_node, gathernd_node, matmul2_node], + "TestGatherElements_test_model", + [input_tensor], + [matmul2_output], + initializer=initializers, + ) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model.ir_version = 7 + + q_config = { + "Matmul_0": self.q_config, + "Matmul_2": self.q_config, + "Gatherelements_1": self.q_config, + } + + quantize_params = { + "input": [np.uint8(10.0), np.float32(0)], + "matmul1_weight": [np.uint8(10.0), np.float32(0)], + "matmul1_output": [np.uint8(10.0), np.float32(0)], + "matmul2_weight": [np.uint8(10.0), np.float32(0)], + "matmul2_output": [np.uint8(10.0), np.float32(0)], + "gatherelements_output": [np.uint8(10.0), np.float32(0)], + } + quantizable_op_types = ["MatMul", "GatherElements"] + q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) + self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) + session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(session) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/quantization/post_training_quant/test_post_training_quant.py b/test/quantization/post_training_quant/test_post_training_quant.py new file mode 100644 index 000000000..2720ff69d --- /dev/null +++ b/test/quantization/post_training_quant/test_post_training_quant.py @@ -0,0 +1,203 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import functools +import glob +import os +import shutil +import unittest +from unittest import mock + +import numpy as np +import onnx +import onnxruntime as ort +from optimum.exporters.onnx import main_export + +from onnx_neural_compressor import data_reader, quantization +from onnx_neural_compressor.quantization import config + +from typing import Callable, Dict, List, Optional, Union # isort: skip + + +def fake_eval(model, eval_result_lst): + acc = eval_result_lst.pop(0) + return acc + + +class DataReader(data_reader.CalibrationDataReader): + + def __init__(self, model): + model = onnx.load(model) + batch_size = 1 + sequence_length = 1 + self.data = { + "input_ids": np.random.randint(10, size=(batch_size, sequence_length)).astype("int64"), + "attention_mask": np.zeros((batch_size, sequence_length)).astype("int64"), + } + for inp in model.graph.input: + if inp.name in self.data: + continue + if inp.name == "position_ids": + # model is exported with optimum >= 1.14.0 with new input 'position_ids' + self.data[inp.name] = np.random.randint(10, size=(batch_size, sequence_length)).astype("int64") + + self.enum_data = None + + def get_next(self): + if self.enum_data is None: + self.enum_data = iter([self.data]) + return next(self.enum_data, None) + + def rewind(self): + self.enum_data = None + + +def _count_op_num(model, optype): + num = 0 + for node in model.graph.node: + if node.op_type == optype: + num += 1 + return num + + +class TestPostTrainingQuant(unittest.TestCase): + + @classmethod + def setUpClass(self): + main_export( + "hf-internal-testing/tiny-random-gptj", + output="model", + ) + self.model = glob.glob(os.path.join("./model", "*.onnx"))[0] + self.data_reader = DataReader(self.model) + + @classmethod + def tearDownClass(self): + shutil.rmtree("./model", ignore_errors=True) + os.remove("quant.onnx") + os.remove("quant.onnx_data") + + def test_static_quant(self): + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QInt8, + per_channel=True, + quant_last_matmul=True, + calibrate_method=quantization.CalibrationMethod.Entropy, + extra_options={"WeightSymmetric": True, "ActivationSymmetric": False}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + qmatmul_num_enable_last = _count_op_num(q_model, "QLinearMatMul") + + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QInt8, + calibrate_method=quantization.CalibrationMethod.Percentile, + per_channel=True, + quant_last_matmul=False, + extra_options={"WeightSymmetric": True, "ActivationSymmetric": False}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + node_num_basic = len(q_model.graph.node) + qmatmul_num_disable_last = _count_op_num(q_model, "QLinearMatMul") + + # check quant_last_matmul work + self.assertEqual(qmatmul_num_enable_last, qmatmul_num_disable_last + 1) + + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QUInt8, + per_channel=False, + quant_last_matmul=False, + extra_options={"WeightSymmetric": False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg, ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED) + q_model = onnx.load("quant.onnx") + node_num_extended = 
len(q_model.graph.node) + + # check graph optimization work + self.assertGreater(node_num_basic, node_num_extended) + + # check op_types_to_quantize work + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QUInt8, + per_channel=False, + quant_last_matmul=False, + op_types_to_quantize=["MatMul", "Gather"], + extra_options={"WeightSymmetric": False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + self.assertEqual(_count_op_num(q_model, "QLinearAdd"), 0) + self.assertGreater(_count_op_num(q_model, "QLinearMatMul"), 0) + + # check nodes_to_quantize work + quantizable_matmuls = [i.name.split("_quant")[0] for i in q_model.graph.node if i.op_type == "QLinearMatMul"] + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QUInt8, + nodes_to_quantize=[quantizable_matmuls[0]], + per_channel=False, + quant_last_matmul=False, + op_types_to_quantize=["MatMul", "Gather"], + extra_options={"WeightSymmetric": False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + self.assertEqual(_count_op_num(q_model, "QLinearMatMul"), 1) + + # check nodes_to_exclude work + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QUInt8, + nodes_to_exclude=[quantizable_matmuls[0]], + per_channel=False, + quant_last_matmul=False, + extra_options={"WeightSymmetric": False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + self.assertEqual(_count_op_num(q_model, "QLinearMatMul"), qmatmul_num_disable_last - 1) + + def test_dynamic_quant(self): + cfg = config.DynamicQuantConfig( + weight_type=quantization.QuantType.QInt8, + per_channel=True, + quant_last_matmul=False, + extra_options={"WeightSymmetric": True, "ActivationSymmetric": False}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + + cfg = config.DynamicQuantConfig( + weight_type=quantization.QuantType.QUInt8, + per_channel=False, + quant_last_matmul=False, + extra_options={"WeightSymmetric": False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg, ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/quantization/post_training_quant/test_quant_utils.py b/test/quantization/post_training_quant/test_quant_utils.py new file mode 100644 index 000000000..e98c6104d --- /dev/null +++ b/test/quantization/post_training_quant/test_quant_utils.py @@ -0,0 +1,62 @@ +import unittest + +import numpy as np +import onnx + +from onnx_neural_compressor.algorithms import utility as quant_utils + + +class TestQuantUtility(unittest.TestCase): + + def test_pad_tensor(self): + data = np.random.random((100, 32)) + group_size = 32 + k_blocks = (100 - 1) // 32 + 1 + pad_data = quant_utils.pad_tensor(data, group_size, k_blocks) + self.assertEqual(pad_data.shape, (k_blocks * group_size, 32)) + + def test_quant_dequant_data(self): + data = np.random.random((100, 32)) + qrange = quant_utils.get_qmin_qmax_for_qType( + qType=onnx.TensorProto.UINT8, + 
reduce_range=False, + sym=True, + ) + self.assertEqual(qrange[0], 0) + self.assertEqual(qrange[1], 255) + + rmin = np.min(np.min(data), 0) + rmax = np.max(np.max(data), 0) + + _, _, zero_point, scale, quantized_data = quant_utils.quantize_data( + data=data, + qType=onnx.TensorProto.UINT8, + sym=True, + ) + + dq_data = quant_utils.dequantize_data( + tensor_value=quantized_data, + scale_value=scale, + zo_value=zero_point, + ) + self.assertLess(np.max(np.abs(dq_data - data)), 0.005) + + _, _, zero_point, scale, quantized_data = quant_utils.quantize_data_per_channel( + data=data, + qType=onnx.TensorProto.UINT8, + sym=True, + axis=1, + ) + + dq_data = quant_utils.dequantize_data( + tensor_value=quantized_data, + scale_value=scale, + zo_value=zero_point, + axis=1, + ) + + self.assertLess(np.max(np.abs(dq_data - data)), 0.005) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/quantization/test_autotune.py b/test/quantization/test_autotune.py index 0e86c64b9..dd6ddf0db 100644 --- a/test/quantization/test_autotune.py +++ b/test/quantization/test_autotune.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -# # Copyright (c) 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -26,8 +24,8 @@ import onnxruntime as ort from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, data_reader -from onnx_neural_compressor.quantization import tuning +from onnx_neural_compressor import data_reader, quantization +from onnx_neural_compressor.quantization import config, tuning from typing import Callable, Dict, List, Optional, Union # isort: skip @@ -86,7 +84,7 @@ def setUpClass(self): def tearDownClass(self): shutil.rmtree("./gptj", ignore_errors=True) - @mock.patch("logging.Logger.warning") + @mock.patch("onnx_neural_compressor.logger.warning") def test_auto_tune_warning(self, mock_warning): acc_data = iter([1.0, 0.8, 0.99, 1.0, 0.99, 0.99]) @@ -157,16 +155,20 @@ def eval_fn_wrapper(model): self.assertIsNotNone(best_model) def test_rtn_auto_tune(self): - acc_data = iter([1.0, 0.8, 0.6, 1.0, 0.99, 0.9]) - - def eval_acc_fn(model) -> float: - return next(acc_data) - - perf_data = iter([1.0, 0.99, 0.99]) - - def eval_perf_fn(model) -> float: - return next(perf_data) + eval_acc_fn = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.9]) + with self.assertRaises(SystemExit): + custom_tune_config = tuning.TuningConfig( + config_set=[config.RTNConfig(weight_group_size=32), config.RTNConfig(weight_group_size=64)] + ) + best_model = tuning.autotune( + model_input=self.gptj, + tune_config=custom_tune_config, + eval_fn=eval_acc_fn, + calibration_data_reader=self.data_reader, + ) + eval_perf_fn = functools.partial(fake_eval, eval_result_lst=[1.0, 0.99, 0.99]) + eval_acc_fn = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.99]) eval_fns = [ {"eval_fn": eval_acc_fn, "weight": 0.5, "name": "accuracy"}, { @@ -174,24 +176,12 @@ def eval_perf_fn(model) -> float: "weight": 0.5, }, ] - evaluator = _create_evaluator_for_eval_fns(eval_fns) def eval_fn_wrapper(model): result = evaluator.evaluate(model) return result - custom_tune_config = tuning.TuningConfig( - config_set=[config.RTNConfig(weight_group_size=32), config.RTNConfig(weight_group_size=64)] - ) - best_model = tuning.autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_acc_fn, - calibration_data_reader=self.data_reader, - ) - self.assertIsNone(best_model) - custom_tune_config = 
tuning.TuningConfig(config_set=[config.RTNConfig(weight_group_size=[32, 64])]) best_model = tuning.autotune( model_input=self.gptj, @@ -199,26 +189,32 @@ def eval_fn_wrapper(model): eval_fn=eval_fn_wrapper, calibration_data_reader=self.data_reader, ) + self.assertEqual(len(evaluator.eval_fn_registry), 2) self.assertIsNotNone(best_model) + op_names = [ i.name for i in best_model.graph.node - if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(4, 32)) + if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(4, 64)) ] self.assertTrue(len(op_names) > 0) def test_awq_auto_tune(self): - acc_data = iter([1.0, 0.8, 0.6, 1.0, 0.99, 0.9]) - - def eval_acc_fn(model) -> float: - return next(acc_data) - - perf_data = iter([1.0, 0.99, 0.99]) - - def eval_perf_fn(model) -> float: - return next(perf_data) + eval_acc_fn = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.9]) + with self.assertRaises(SystemExit): + custom_tune_config = tuning.TuningConfig( + config_set=[config.AWQConfig(weight_group_size=32), config.AWQConfig(weight_group_size=64)] + ) + best_model = tuning.autotune( + model_input=self.gptj, + tune_config=custom_tune_config, + eval_fn=eval_acc_fn, + calibration_data_reader=self.data_reader, + ) + eval_perf_fn = functools.partial(fake_eval, eval_result_lst=[1.0, 0.99, 0.99]) + eval_acc_fn = functools.partial(fake_eval, eval_result_lst=[1.0, 0.99, 0.99]) eval_fns = [ {"eval_fn": eval_acc_fn, "weight": 0.5, "name": "accuracy"}, { @@ -226,24 +222,12 @@ def eval_perf_fn(model) -> float: "weight": 0.5, }, ] - evaluator = _create_evaluator_for_eval_fns(eval_fns) def eval_fn_wrapper(model): result = evaluator.evaluate(model) return result - custom_tune_config = tuning.TuningConfig( - config_set=[config.AWQConfig(weight_group_size=32), config.AWQConfig(weight_group_size=64)] - ) - best_model = tuning.autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_acc_fn, - calibration_data_reader=self.data_reader, - ) - self.assertIsNone(best_model) - custom_tune_config = tuning.TuningConfig(config_set=[config.AWQConfig(weight_group_size=[32, 64])]) best_model = tuning.autotune( model_input=self.gptj, @@ -261,16 +245,20 @@ def eval_fn_wrapper(model): self.assertTrue(len(op_names) > 0) def test_gptq_auto_tune(self): - acc_data = iter([1.0, 0.8, 0.6, 1.0, 0.99, 0.9]) - - def eval_acc_fn(model) -> float: - return next(acc_data) - - perf_data = iter([1.0, 0.99, 0.99]) - - def eval_perf_fn(model) -> float: - return next(perf_data) + eval_acc_fn = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.9]) + with self.assertRaises(SystemExit): + custom_tune_config = tuning.TuningConfig( + config_set=[config.GPTQConfig(weight_group_size=32), config.GPTQConfig(weight_group_size=64)] + ) + best_model = tuning.autotune( + model_input=self.gptj, + tune_config=custom_tune_config, + eval_fn=eval_acc_fn, + calibration_data_reader=self.data_reader, + ) + eval_perf_fn = functools.partial(fake_eval, eval_result_lst=[1.0, 0.99, 0.99]) + eval_acc_fn = functools.partial(fake_eval, eval_result_lst=[1.0, 0.99, 0.99]) eval_fns = [ {"eval_fn": eval_acc_fn, "weight": 0.5, "name": "accuracy"}, { @@ -284,17 +272,6 @@ def eval_fn_wrapper(model): result = evaluator.evaluate(model) return result - custom_tune_config = tuning.TuningConfig( - config_set=[config.GPTQConfig(weight_group_size=32), config.GPTQConfig(weight_group_size=64)] - ) - best_model = tuning.autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_acc_fn, - 
calibration_data_reader=self.data_reader, - ) - self.assertIsNone(best_model) - custom_tune_config = tuning.TuningConfig(config_set=[config.GPTQConfig(weight_group_size=[32, 64])]) best_model = tuning.autotune( model_input=self.gptj, @@ -330,7 +307,6 @@ def test_woq_auto_tune(self): if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(8, 32)) ] self.assertTrue(len(op_names) > 0) - partial_fake_eval = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.81, 1.0, 0.99, 0.99]) custom_tune_config = tuning.TuningConfig(config_set=config.get_woq_tuning_config()) @@ -370,6 +346,120 @@ def test_woq_auto_tune(self): ] self.assertTrue(len(op_names) > 0) + def test_dynamic_auto_tune(self): + partial_fake_eval = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.99, 0.81, 1.0, 0.99]) + + custom_tune_config = tuning.TuningConfig(config_set=config.DynamicQuantConfig.get_config_set_for_tuning()) + best_model = tuning.autotune( + model_input=self.gptj, + tune_config=custom_tune_config, + eval_fn=partial_fake_eval, + ) + self.assertIsNotNone(best_model) + + def test_dynamic_custom_auto_tune(self): + partial_fake_eval = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.99]) + custom_tune_config = tuning.TuningConfig( + config_set=config.DynamicQuantConfig( + per_channel=[True, False], + execution_provider="CPUExecutionProvider", + ) + ) + best_model = tuning.autotune( + model_input=self.gptj, + tune_config=custom_tune_config, + eval_fn=partial_fake_eval, + calibration_data_reader=self.data_reader, + ) + + optypes = [i.op_type for i in best_model.graph.node] + self.assertTrue("DynamicQuantizeLinear" in optypes) + self.assertTrue("MatMulInteger" in optypes) + ort.InferenceSession(best_model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(best_model) + + partial_fake_eval = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.82, 0.81, 1.0, 0.99]) + for execution_provider in ["DmlExecutionProvider", "TensorrtExecutionProvider"]: + with self.assertRaises(SystemExit): + custom_tune_config = tuning.TuningConfig( + config_set=config.DynamicQuantConfig( + per_channel=[True, False], + execution_provider=execution_provider, + ) + ) + best_model = tuning.autotune( + model_input=self.gptj, + tune_config=custom_tune_config, + eval_fn=partial_fake_eval, + calibration_data_reader=self.data_reader, + ) + + def test_static_default_auto_tune(self): + partial_fake_eval = functools.partial(fake_eval, eval_result_lst=[1.0, 0.99]) + + custom_tune_config = tuning.TuningConfig( + config_set=config.StaticQuantConfig.get_config_set_for_tuning( + execution_provider="TensorrtExecutionProvider", + quant_format=quantization.QuantFormat.QDQ, + ) + ) + best_model = tuning.autotune( + model_input=self.gptj, + tune_config=custom_tune_config, + eval_fn=partial_fake_eval, + calibration_data_reader=self.data_reader, + ) + optypes = [i.op_type for i in best_model.graph.node] + self.assertTrue("QLinearMatMul" not in optypes) + self.assertTrue("QuantizeLinear" in optypes) + self.assertTrue("MatMul" in optypes) + ort.InferenceSession(best_model.SerializeToString(), providers=["TensorrtExecutionProvider"]) + self.assertIsNotNone(best_model) + + def test_static_custom_auto_tune(self): + partial_fake_eval = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.99]) + + custom_tune_config = tuning.TuningConfig( + config_set=config.StaticQuantConfig( + per_channel=[True, False], + execution_provider="CPUExecutionProvider", + 
quant_format=quantization.QuantFormat.QOperator, + ) + ) + best_model = tuning.autotune( + model_input=self.gptj, + tune_config=custom_tune_config, + eval_fn=partial_fake_eval, + calibration_data_reader=self.data_reader, + ) + + optypes = [i.op_type for i in best_model.graph.node] + self.assertTrue("QLinearMatMul" in optypes) + self.assertTrue("QuantizeLinear" in optypes) + ort.InferenceSession(best_model.SerializeToString(), providers=["CPUExecutionProvider"]) + self.assertIsNotNone(best_model) + + @mock.patch("onnx_neural_compressor.logger.warning") + def test_skip_verified_config_mapping(self, mock_warning): + partial_fake_eval = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.99]) + + with self.assertRaises(SystemExit): + custom_tune_config = tuning.TuningConfig( + config_set=config.StaticQuantConfig( + per_channel=[True, False], + execution_provider="DmlExecutionProvider", + ) + ) + best_model = tuning.autotune( + model_input=self.gptj, + tune_config=custom_tune_config, + eval_fn=partial_fake_eval, + calibration_data_reader=self.data_reader, + ) + call_args_list = mock_warning.call_args_list + # There may be multiple calls to warning, so we need to check all of them + self.assertIn("Skip the verified config mapping.", [info[0][0] for info in call_args_list]) + if __name__ == "__main__": unittest.main() diff --git a/test/quantization/test_config.py b/test/quantization/test_config.py index 50ffc74d0..39c09bbf0 100644 --- a/test/quantization/test_config.py +++ b/test/quantization/test_config.py @@ -7,8 +7,9 @@ import onnx from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, logger, utility +from onnx_neural_compressor import logger, quantization, utility from onnx_neural_compressor.quantization import algorithm_entry as algos +from onnx_neural_compressor.quantization import config, tuning def find_onnx_file(folder_path): @@ -83,6 +84,243 @@ def _count_woq_matmul(self, q_model, bits=4, group_size=32): ] return len(op_names) + def test_dynamic_quant_config(self): + for execution_provider in ["CPUExecutionProvider", "CUDAExecutionProvider", "DnnlExecutionProvider"]: + tuning_config = tuning.TuningConfig( + config_set=config.DynamicQuantConfig.get_config_set_for_tuning( + execution_provider=execution_provider, + ) + ) + config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler) + for idx, quant_config in enumerate(config_loader): + model_info = quant_config.get_model_info(model=self.simple_onnx_model) + configs_mapping = quant_config.to_config_mapping(model_info=model_info) + if idx == 0: + self.assertTrue(configs_mapping["Matmul"]["per_channel"]) + elif idx == 1: + self.assertFalse(configs_mapping["Matmul"]["per_channel"]) + if 3 < idx < 8: + self.assertTrue("LSTM" not in quant_config.op_types_to_quantize) + elif 7 < idx < 12: + self.assertTrue("Conv" not in quant_config.op_types_to_quantize) + elif 11 < idx < 16: + self.assertTrue("Attention" not in quant_config.op_types_to_quantize) + elif 15 < idx < 20: + self.assertTrue("MatMul" not in quant_config.op_types_to_quantize) + self.assertLess(idx, 20) + self.assertTrue("add" not in configs_mapping and "add2" not in configs_mapping) + + for execution_provider in ["DmlExecutionProvider", "TensorrtExecutionProvider"]: + tuning_config = tuning.TuningConfig( + config_set=config.DynamicQuantConfig.get_config_set_for_tuning( + execution_provider=execution_provider, + ) + ) + config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, 
sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                self.assertTrue("add" not in configs_mapping)
+                self.assertTrue("add2" not in configs_mapping)
+                self.assertTrue("Matmul" not in configs_mapping)
+
+        self.assertEqual(len(config_loader.config_set), 20)
+
+    def test_dynamic_custom_quant_config(self):
+        for execution_provider in ["CPUExecutionProvider", "CUDAExecutionProvider", "DnnlExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.DynamicQuantConfig(
+                    per_channel=[True, False],
+                    execution_provider=execution_provider,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                if idx == 0:
+                    self.assertTrue(configs_mapping["Matmul"]["per_channel"])
+                elif idx == 1:
+                    self.assertFalse(configs_mapping["Matmul"]["per_channel"])
+                self.assertLess(idx, 2)
+                self.assertTrue("add" not in configs_mapping and "add2" not in configs_mapping)
+
+        for execution_provider in ["DmlExecutionProvider", "TensorrtExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.DynamicQuantConfig(
+                    per_channel=[True, False],
+                    execution_provider=execution_provider,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                self.assertTrue("add" not in configs_mapping)
+                self.assertTrue("add2" not in configs_mapping)
+                self.assertTrue("Matmul" not in configs_mapping)
+                self.assertLess(idx, 4)
+
+        self.assertEqual(len(config_loader.config_set), 2)
+
+    def test_static_quant_config(self):
+        for execution_provider in ["CPUExecutionProvider", "CUDAExecutionProvider", "DnnlExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.StaticQuantConfig.get_config_set_for_tuning(
+                    execution_provider=execution_provider,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                if idx in [0, 4]:
+                    self.assertTrue(configs_mapping["Matmul"]["per_channel"])
+                elif idx in [1, 5]:
+                    self.assertFalse(configs_mapping["Matmul"]["per_channel"])
+                if idx < 4:
+                    self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
+                else:
+                    self.assertFalse("add" in configs_mapping)
+                if idx in [0, 1]:
+                    self.assertEqual(configs_mapping["Matmul"]["calibrate_method"], "MinMax")
+                self.assertLess(idx, 16)
+
+        for execution_provider in ["TensorrtExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.StaticQuantConfig.get_config_set_for_tuning(
+                    execution_provider=execution_provider,
+                    quant_format=quantization.QuantFormat.QOperator,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                self.assertTrue("add" not in configs_mapping)
+                self.assertTrue("add2" not in configs_mapping)
+                self.assertTrue("Matmul" not in configs_mapping)
+
+        self.assertEqual(len(config_loader.config_set), 16)
+
+        for execution_provider in ["DmlExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.StaticQuantConfig.get_config_set_for_tuning(
+                    execution_provider=execution_provider,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                if "Matmul" in configs_mapping:
+                    self.assertFalse(configs_mapping["Matmul"]["per_channel"])
+                    self.assertEqual(configs_mapping["Matmul"]["calibrate_method"], "MinMax")
+                if "add" in configs_mapping:
+                    self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
+                self.assertLess(idx, 16)
+
+        for execution_provider in ["TensorrtExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.StaticQuantConfig.get_config_set_for_tuning(
+                    execution_provider=execution_provider,
+                    quant_format=quantization.QuantFormat.QDQ,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                if idx in [0, 4]:
+                    self.assertTrue(configs_mapping["Matmul"]["per_channel"])
+                elif idx in [1, 5]:
+                    self.assertFalse(configs_mapping["Matmul"]["per_channel"])
+                if "add" in configs_mapping:
+                    self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
+                    self.assertTrue(configs_mapping["add"]["weight_sym"])
+                    self.assertTrue(configs_mapping["add"]["activation_sym"])
+                if "Matmul" in configs_mapping:
+                    self.assertTrue(configs_mapping["Matmul"]["weight_sym"])
+                    self.assertTrue(configs_mapping["Matmul"]["activation_sym"])
+                self.assertLess(idx, 16)
+
+    def test_static_custom_quant_config(self):
+        for execution_provider in ["CPUExecutionProvider", "CUDAExecutionProvider", "DnnlExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.StaticQuantConfig(
+                    per_channel=[True, False],
+                    execution_provider=execution_provider,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                if idx == 0:
+                    self.assertTrue(configs_mapping["Matmul"]["per_channel"])
+                elif idx == 1:
+                    self.assertFalse(configs_mapping["Matmul"]["per_channel"])
+                self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
+
+            self.assertLess(idx, 2)
+
+        for execution_provider in ["TensorrtExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.StaticQuantConfig(
+                    per_channel=[True, False],
+                    execution_provider=execution_provider,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                self.assertTrue("add" not in configs_mapping)
+                self.assertTrue("add2" not in configs_mapping)
+                self.assertTrue("Matmul" not in configs_mapping)
+
+            # no op-level quant config entries for this EP; the config set itself still holds 2 configs
+            self.assertEqual(len(config_loader.config_set), 2)
+
+        for execution_provider in ["DmlExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.StaticQuantConfig(
+                    per_channel=[True, False],
+                    execution_provider=execution_provider,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                self.assertFalse(configs_mapping["Matmul"]["per_channel"])
+                self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
+                self.assertLess(idx, 4)
+
+        for execution_provider in ["TensorrtExecutionProvider"]:
+            tuning_config = tuning.TuningConfig(
+                config_set=config.StaticQuantConfig(
+                    per_channel=[True, False],
+                    execution_provider=execution_provider,
+                    quant_format=quantization.QuantFormat.QDQ,
+                )
+            )
+            config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler)
+            for idx, quant_config in enumerate(config_loader):
+                model_info = quant_config.get_model_info(model=self.simple_onnx_model)
+                configs_mapping = quant_config.to_config_mapping(model_info=model_info)
+                if idx == 0:
+                    self.assertTrue(configs_mapping["Matmul"]["per_channel"])
+                elif idx == 1:
+                    self.assertFalse(configs_mapping["Matmul"]["per_channel"])
+                self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
+                self.assertTrue(configs_mapping["add"]["weight_sym"])
+                self.assertTrue(configs_mapping["add"]["activation_sym"])
+                self.assertTrue(configs_mapping["Matmul"]["weight_sym"])
+                self.assertTrue(configs_mapping["Matmul"]["activation_sym"])
+                self.assertLess(idx, 2)
+
     def test_config_white_lst(self):
         global_config = config.RTNConfig(weight_bits=4)
         # set operator instance
@@ -113,12 +351,12 @@ def test_config_white_lst3(self):
         quant_config = global_config + fc_out_config
         # get model and quantize
         fp32_model = self.gptj
-        model_info = utility.get_model_info(fp32_model, white_op_type_list=["MatMul"])
+        model_info = config.RTNConfig.get_model_info(fp32_model)
         logger.info(quant_config)
         configs_mapping = quant_config.to_config_mapping(model_info=model_info)
         logger.info(configs_mapping)
-        self.assertTrue(configs_mapping[("/h.4/mlp/fc_out/MatMul", "MatMul")].weight_bits == 8)
-        self.assertTrue(configs_mapping[("/h.4/mlp/fc_in/MatMul", "MatMul")].weight_bits == 4)
+        self.assertTrue(configs_mapping["/h.4/mlp/fc_out/MatMul"]["weight_bits"] == 8)
+        self.assertTrue(configs_mapping["/h.4/mlp/fc_in/MatMul"]["weight_bits"] == 4)

     def test_config_from_dict(self):
         quant_config = {
@@ -170,6 +408,7 @@ def test_same_type_configs_addition(self):
                 },
             }
         }
+        q_config2 = config.RTNConfig.from_dict(quant_config2["rtn"])
         q_config3 = q_config + q_config2
         q3_dict = q_config3.to_dict()
@@ -185,21 +424,21 @@ def test_config_mapping(self):
         quant_config.set_local("/h.4/mlp/fc_out/MatMul", fc_out_config)
         # get model and quantize
         fp32_model = self.gptj
-        model_info = utility.get_model_info(fp32_model, white_op_type_list=["MatMul"])
+        model_info = config.RTNConfig.get_model_info(fp32_model)
         logger.info(quant_config)
         configs_mapping = quant_config.to_config_mapping(model_info=model_info)
         logger.info(configs_mapping)
-        self.assertTrue(configs_mapping[("/h.4/mlp/fc_out/MatMul", "MatMul")].weight_bits == 8)
-        self.assertTrue(configs_mapping[("/h.4/mlp/fc_in/MatMul", "MatMul")].weight_bits == 4)
+        self.assertTrue(configs_mapping["/h.4/mlp/fc_out/MatMul"]["weight_bits"] == 8)
+        self.assertTrue(configs_mapping["/h.4/mlp/fc_in/MatMul"]["weight_bits"] == 4)
         # test regular matching
         fc_config = config.RTNConfig(weight_bits=3)
         quant_config.set_local("/h.[1-4]/mlp/fc_out/MatMul", fc_config)
         configs_mapping = quant_config.to_config_mapping(model_info=model_info)
         logger.info(configs_mapping)
-        self.assertTrue(configs_mapping[("/h.4/mlp/fc_out/MatMul", "MatMul")].weight_bits == 3)
-        self.assertTrue(configs_mapping[("/h.3/mlp/fc_out/MatMul", "MatMul")].weight_bits == 3)
-        self.assertTrue(configs_mapping[("/h.2/mlp/fc_out/MatMul", "MatMul")].weight_bits == 3)
-        self.assertTrue(configs_mapping[("/h.1/mlp/fc_out/MatMul", "MatMul")].weight_bits == 3)
+        self.assertTrue(configs_mapping["/h.4/mlp/fc_out/MatMul"]["weight_bits"] == 3)
+        self.assertTrue(configs_mapping["/h.3/mlp/fc_out/MatMul"]["weight_bits"] == 3)
+        self.assertTrue(configs_mapping["/h.2/mlp/fc_out/MatMul"]["weight_bits"] == 3)
+        self.assertTrue(configs_mapping["/h.1/mlp/fc_out/MatMul"]["weight_bits"] == 3)

     def test_diff_types_configs_addition(self):
         quant_config1 = {
@@ -219,12 +458,12 @@ class TestQuantConfigForAutotune(unittest.TestCase):

-    def test_expand_config(self):
+    def test_expand_woq_config(self):
         # test the expand functionalities, the user is not aware it
         tune_config = config.RTNConfig(weight_bits=[4, 8])
         expand_config_list = config.RTNConfig.expand(tune_config)
-        self.assertEqual(expand_config_list[0].weight_bits, 4)
-        self.assertEqual(expand_config_list[1].weight_bits, 8)
+        self.assertEqual(expand_config_list[0]["weight_bits"], 4)
+        self.assertEqual(expand_config_list[1]["weight_bits"], 8)


 if __name__ == "__main__":
diff --git a/test/quantization/test_smooth_quant.py b/test/quantization/test_smooth_quant.py
index fed59e142..242417e6e 100644
--- a/test/quantization/test_smooth_quant.py
+++ b/test/quantization/test_smooth_quant.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-#
 # Copyright (c) 2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -21,12 +19,13 @@
 import numpy as np
 import onnx
+import onnxruntime as ort
 from optimum.exporters.onnx import main_export

-from onnx_neural_compressor import config, data_reader
+from onnx_neural_compressor import data_reader
 from onnx_neural_compressor.quantization import QuantType
 from onnx_neural_compressor.quantization import algorithm_entry as algos
-from onnx_neural_compressor.quantization import quantize
+from onnx_neural_compressor.quantization import config, quantize


 class DataReader(data_reader.CalibrationDataReader):
@@ -72,6 +71,7 @@ def setUpClass(self):
     @classmethod
     def tearDownClass(self):
         shutil.rmtree("./gptj", ignore_errors=True)
+        os.remove("Optimized_model.onnx")

     def test_sq_from_class_beginner(self):
         self.data_reader.rewind()
@@ -111,6 +111,43 @@ def test_sq_with_ort_like_api(self):
         self.assertTrue(3 not in [i.data_type for i in model.graph.initializer])
         self.assertEqual(num_muls, 30)
+
+    def test_smooth_quant_args(self):
+        self.data_reader.rewind()
+        sq_config = config.SmoothQuantConfig(
+            weight_type=QuantType.QUInt8, activation_type=QuantType.QUInt8, alpha="auto"
+        )
+        model = algos.smooth_quant_entry(self.gptj, sq_config, self.data_reader)
+        num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"])
+        self.assertEqual(num_muls, 30)
+
+        self.data_reader.rewind()
+        sq_config = config.SmoothQuantConfig(
+            weight_type=QuantType.QUInt8, activation_type=QuantType.QUInt8, scales_per_op=False
+        )
+        model = algos.smooth_quant_entry(self.gptj, sq_config, self.data_reader)
+        num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"])
+        self.assertEqual(num_muls, 15)
+
+        sess_options = ort.SessionOptions()
+        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
+        sess_options.optimized_model_filepath = "Optimized_model.onnx"
+        sess = ort.InferenceSession(self.gptj, sess_options, providers=["CPUExecutionProvider"])
+        self.data_reader.rewind()
+        sq_config = config.SmoothQuantConfig(
+            weight_type=QuantType.QUInt8, activation_type=QuantType.QUInt8, folding=True, scales_per_op=False
+        )
+        model = algos.smooth_quant_entry("Optimized_model.onnx", sq_config, self.data_reader)
+        num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"])
+        self.assertEqual(num_muls, 10)
+
+        self.data_reader.rewind()
+        sq_config = config.SmoothQuantConfig(
+            weight_type=QuantType.QUInt8, activation_type=QuantType.QUInt8, folding=False, scales_per_op=False
+        )
+        model = algos.smooth_quant_entry("Optimized_model.onnx", sq_config, self.data_reader)
+        num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"])
+        self.assertEqual(num_muls, 15)
+

 if __name__ == "__main__":
     unittest.main()
diff --git a/test/quantization/weight_only/test_awq.py b/test/quantization/weight_only/test_awq.py
index 2d918cc61..e1c23d495 100644
--- a/test/quantization/weight_only/test_awq.py
+++ b/test/quantization/weight_only/test_awq.py
@@ -8,9 +8,9 @@
 import transformers
 from optimum.exporters.onnx import main_export

-from onnx_neural_compressor import config, data_reader, logger
+from onnx_neural_compressor import data_reader, logger
 from onnx_neural_compressor.quantization import algorithm_entry as algos
-from onnx_neural_compressor.quantization import matmul_4bits_quantizer, matmul_nbits_quantizer
+from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer, matmul_nbits_quantizer


 def find_onnx_file(folder_path):
diff --git a/test/quantization/weight_only/test_gptq.py b/test/quantization/weight_only/test_gptq.py
index 133e11fd1..1e674b7dd 100644
--- a/test/quantization/weight_only/test_gptq.py
+++ b/test/quantization/weight_only/test_gptq.py
@@ -8,9 +8,9 @@
 import transformers
 from optimum.exporters.onnx import main_export

-from onnx_neural_compressor import config, data_reader, logger
+from onnx_neural_compressor import data_reader, logger
 from onnx_neural_compressor.quantization import algorithm_entry as algos
-from onnx_neural_compressor.quantization import matmul_4bits_quantizer, matmul_nbits_quantizer
+from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer, matmul_nbits_quantizer


 def find_onnx_file(folder_path):
diff --git a/test/quantization/weight_only/test_rtn.py b/test/quantization/weight_only/test_rtn.py
index 86b3c49a3..aa3672d0c 100644
--- a/test/quantization/weight_only/test_rtn.py
+++ b/test/quantization/weight_only/test_rtn.py
@@ -6,9 +6,9 @@

 from optimum.exporters.onnx import main_export

-from onnx_neural_compressor import config, logger
+from onnx_neural_compressor import logger
 from onnx_neural_compressor.quantization import algorithm_entry as algos
-from onnx_neural_compressor.quantization import matmul_4bits_quantizer, matmul_nbits_quantizer
+from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer, matmul_nbits_quantizer


 def find_onnx_file(folder_path):
diff --git a/test/utils/test_general.py b/test/utils/test_general.py
index d24392438..b07d73115 100644
--- a/test/utils/test_general.py
+++ b/test/utils/test_general.py
@@ -2,8 +2,8 @@

 import unittest

-from onnx_neural_compressor import config, constants, logger
-from onnx_neural_compressor.quantization import tuning
+from onnx_neural_compressor import constants, logger
+from onnx_neural_compressor.quantization import config, tuning

 from typing import Any, Callable, List, Optional, Tuple, Union  # isort: skip
@@ -192,7 +192,10 @@ def test_api(self):
         self.assertEqual(fake_default_config.weight_dtype, "int")
         config_set = get_all_config_set()
         self.assertEqual(len(config_set), len(config.config_registry.get_all_config_cls()))
-        self.assertEqual([i for i in config_set if i.name == FAKE_CONFIG_NAME][0].weight_bits, DEFAULT_WEIGHT_BITS)
+        self.assertEqual(
+            [i for i in config_set if getattr(i, "name", "None") == FAKE_CONFIG_NAME][0].weight_bits,
+            DEFAULT_WEIGHT_BITS,
+        )

     def test_config_expand_complex_tunable_type(self):
         target_op_type_list_options = [["Conv", "Gemm"], ["Conv", "Matmul"]]
@@ -211,8 +214,98 @@ def test_mixed_two_algos(self):
         mixed_config = fake_config + fake1_config
         model_info = mixed_config.get_model_info(model)
         config_mapping = mixed_config.to_config_mapping(model_info=model_info)
-        self.assertIn(OP1_NAME, [op_info[0] for op_info in config_mapping])
-        self.assertIn(OP2_NAME, [op_info[0] for op_info in config_mapping])
+        self.assertIn(OP1_NAME, config_mapping)
+        self.assertIn(OP2_NAME, config_mapping)
+
+    def test_config_expand(self) -> None:
+        cfg = config.RTNConfig(
+            weight_bits=[4, 8], weight_sym=[True, False], layer_wise_quant=[True, False], providers=[["CPU"], ["CUDA"]]
+        )
+        expand_cfgs = cfg.expand()
+        self.assertEqual(expand_cfgs[0].weight_bits, 4)
+        self.assertEqual(expand_cfgs[0].weight_sym, True)
+        self.assertEqual(expand_cfgs[0].layer_wise_quant, True)
+        self.assertEqual(expand_cfgs[0].providers, ["CPU"])
+
+        self.assertEqual(expand_cfgs[1].weight_bits, 8)
+        self.assertEqual(expand_cfgs[1].weight_sym, True)
+        self.assertEqual(expand_cfgs[1].layer_wise_quant, True)
+        self.assertEqual(expand_cfgs[1].providers, ["CPU"])
+
+        self.assertEqual(expand_cfgs[2].weight_bits, 4)
+        self.assertEqual(expand_cfgs[2].weight_sym, False)
+        self.assertEqual(expand_cfgs[2].layer_wise_quant, True)
+        self.assertEqual(expand_cfgs[2].providers, ["CPU"])
+
+        self.assertEqual(expand_cfgs[3].weight_bits, 8)
+        self.assertEqual(expand_cfgs[3].weight_sym, False)
+        self.assertEqual(expand_cfgs[3].layer_wise_quant, True)
+        self.assertEqual(expand_cfgs[3].providers, ["CPU"])
+
+        self.assertEqual(expand_cfgs[4].weight_bits, 4)
+        self.assertEqual(expand_cfgs[4].weight_sym, True)
+        self.assertEqual(expand_cfgs[4].layer_wise_quant, True)
+        self.assertEqual(expand_cfgs[4].providers, ["CUDA"])
+
+        self.assertEqual(expand_cfgs[5].weight_bits, 8)
+        self.assertEqual(expand_cfgs[5].weight_sym, True)
+        self.assertEqual(expand_cfgs[5].layer_wise_quant, True)
+        self.assertEqual(expand_cfgs[5].providers, ["CUDA"])
+
+        self.assertEqual(expand_cfgs[6].weight_bits, 4)
+        self.assertEqual(expand_cfgs[6].weight_sym, False)
+        self.assertEqual(expand_cfgs[6].layer_wise_quant, True)
+        self.assertEqual(expand_cfgs[6].providers, ["CUDA"])
+
+        self.assertEqual(expand_cfgs[7].weight_bits, 8)
+        self.assertEqual(expand_cfgs[7].weight_sym, False)
+        self.assertEqual(expand_cfgs[7].layer_wise_quant, True)
+        self.assertEqual(expand_cfgs[7].providers, ["CUDA"])
+
+        self.assertEqual(expand_cfgs[8].weight_bits, 4)
+        self.assertEqual(expand_cfgs[8].weight_sym, True)
+        self.assertEqual(expand_cfgs[8].layer_wise_quant, False)
+        self.assertEqual(expand_cfgs[8].providers, ["CPU"])
+
+        self.assertEqual(expand_cfgs[9].weight_bits, 8)
+        self.assertEqual(expand_cfgs[9].weight_sym, True)
+        self.assertEqual(expand_cfgs[9].layer_wise_quant, False)
+        self.assertEqual(expand_cfgs[9].providers, ["CPU"])
+
+        self.assertEqual(expand_cfgs[10].weight_bits, 4)
+        self.assertEqual(expand_cfgs[10].weight_sym, False)
+        self.assertEqual(expand_cfgs[10].layer_wise_quant, False)
+        self.assertEqual(expand_cfgs[10].providers, ["CPU"])
+
+        self.assertEqual(expand_cfgs[11].weight_bits, 8)
+        self.assertEqual(expand_cfgs[11].weight_sym, False)
+        self.assertEqual(expand_cfgs[11].layer_wise_quant, False)
+        self.assertEqual(expand_cfgs[11].providers, ["CPU"])
+
+        self.assertEqual(expand_cfgs[12].weight_bits, 4)
+        self.assertEqual(expand_cfgs[12].weight_sym, True)
+        self.assertEqual(expand_cfgs[12].layer_wise_quant, False)
+        self.assertEqual(expand_cfgs[12].providers, ["CUDA"])
+
+        self.assertEqual(expand_cfgs[13].weight_bits, 8)
+        self.assertEqual(expand_cfgs[13].weight_sym, True)
+        self.assertEqual(expand_cfgs[13].layer_wise_quant, False)
+        self.assertEqual(expand_cfgs[13].providers, ["CUDA"])
+
+        self.assertEqual(expand_cfgs[14].weight_bits, 4)
+        self.assertEqual(expand_cfgs[14].weight_sym, False)
+        self.assertEqual(expand_cfgs[14].layer_wise_quant, False)
+        self.assertEqual(expand_cfgs[14].providers, ["CUDA"])
+
+        self.assertEqual(expand_cfgs[15].weight_bits, 8)
+        self.assertEqual(expand_cfgs[15].weight_sym, False)
+        self.assertEqual(expand_cfgs[15].layer_wise_quant, False)
+        self.assertEqual(expand_cfgs[15].providers, ["CUDA"])
+
+    def test_config_expand_with_empty_options(self):
+        configs = FakeAlgoConfig(weight_dtype=["int", "float32"], weight_bits=[])
+        configs_list = configs.expand()
+        self.assertEqual(len(configs_list), 2)


 class TestConfigSet(unittest.TestCase):
@@ -247,6 +340,14 @@ def test_config_loader(self) -> None:
         for i, cfg in enumerate(self.loader):
             self.assertEqual(cfg, self.config_set[i])

+    def test_config_loader_skip_verified_config(self) -> None:
+        config_set = [FakeAlgoConfig(weight_bits=[4, 8]), FakeAlgoConfig(weight_bits=8)]
+        config_loader = tuning.ConfigLoader(config_set)
+        config_count = 0
+        for i, _ in enumerate(config_loader):
+            config_count += 1
+        self.assertEqual(config_count, 2)
+

 if __name__ == "__main__":
     unittest.main()
diff --git a/test/utils/test_param.py b/test/utils/test_param.py
index fd8b7d3d3..549e1fb47 100644
--- a/test/utils/test_param.py
+++ b/test/utils/test_param.py
@@ -3,7 +3,7 @@
 import unittest
 from typing import List

-from onnx_neural_compressor import config
+from onnx_neural_compressor.quantization import config


 class TestTuningParam(unittest.TestCase):
@@ -20,6 +20,9 @@ def test_is_tunable_recursive(self):
         self.assertTrue(param.is_tunable([[5, 6], [7, 8]]))
         # TODO: double check if this is the expected behavior
         self.assertTrue(param.is_tunable([[5, 6], [7, "8"]]))
+        self.assertEqual(
+            str(param), "TuningParam(name=param_name, tunable_type=typing.List[typing.List[int]], options=None)."
+        )


 if __name__ == "__main__":
diff --git a/test/utils/test_utility.py b/test/utils/test_utility.py
index fa7a4812f..50ce620b9 100644
--- a/test/utils/test_utility.py
+++ b/test/utils/test_utility.py
@@ -17,26 +17,6 @@ def test_set_random_seed(self):
         with self.assertRaises(AssertionError):
             utility.set_random_seed(seed)

-    def test_set_workspace(self):
-        workspace = "/path/to/workspace"
-        utility.set_workspace(workspace)
-        self.assertEqual(utility.options.workspace, workspace)
-
-        # non String type
-        workspace = 12345
-        with self.assertRaises(AssertionError):
-            utility.set_workspace(workspace)
-
-    def test_set_resume_from(self):
-        resume_from = "/path/to/resume"
-        utility.set_resume_from(resume_from)
-        self.assertEqual(utility.options.resume_from, resume_from)
-
-        # non String type
-        resume_from = 12345
-        with self.assertRaises(AssertionError):
-            utility.set_resume_from(resume_from)
-


 class TestCPUInfo(unittest.TestCase):