Commit 2340f61

fix ut
Signed-off-by: yuwenzho <[email protected]>
yuwenzho committed Jul 2, 2024
1 parent 3b4f716 commit 2340f61
Showing 3 changed files with 23 additions and 103 deletions.
6 changes: 3 additions & 3 deletions onnx_neural_compressor/algorithms/utility.py
@@ -174,12 +174,12 @@ def quantize_data_per_channel(data, axis, qType, sym, reduce_range=False):
return rmin.reshape(-1, 1), rmax.reshape(-1, 1), zero_point.reshape(-1, 1), scale.reshape(-1, 1), quantized_data


-def dequantize_data_with_scale_zero(tensor_value, scale_value, zo_value): # pragma: no cover
+def dequantize_data_with_scale_zero(tensor_value, scale_value, zo_value):
"""Dequantize tensor with scale and zero point."""
return (tensor_value.astype(scale_value.dtype) - zo_value.astype(scale_value.dtype)) * scale_value


-def dequantize_data(tensor_value, scale_value, zo_value, axis=0): # pragma: no cover
+def dequantize_data(tensor_value, scale_value, zo_value, axis=0):
"""Dequantize tensor."""
if not isinstance(scale_value, np.ndarray):
return dequantize_data_with_scale_zero(tensor_value, scale_value, zo_value)
@@ -386,7 +386,7 @@ def make_matmul_weight_only_node(
# require onnxruntime > 1.16.3
kwargs["accuracy_level"] = accuracy_level

-else:
+else: # pragma: no cover
offset = 5 if zero_point is not None else 4
op_type = "MatMulFpQ4"

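For context, the two dequantize helpers brought back under coverage above implement plain affine dequantization. A minimal numpy sketch of the same arithmetic, with example values chosen here for illustration:

```python
import numpy as np

# Affine dequantization, as in dequantize_data_with_scale_zero above:
# subtract the zero point, then multiply by the scale.
quantized = np.array([-128, 0, 127], dtype=np.int8)
scale = np.float32(0.05)
zero_point = np.int8(0)

dequantized = (quantized.astype(scale.dtype) - zero_point.astype(scale.dtype)) * scale
print(dequantized)  # ≈ [-6.4   0.    6.35]
```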
26 changes: 19 additions & 7 deletions test/quantization/layer_wise/test_layer_wise.py
@@ -10,11 +10,12 @@
import transformers
from optimum.exporters.onnx import main_export

-from onnx_neural_compressor import data_reader, logger
+from onnx_neural_compressor import data_reader, logger, onnx_model
from onnx_neural_compressor.quantization import algorithm_entry as algos
from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer



def find_onnx_file(folder_path):
# return first .onnx file path in folder_path
for root, dirs, files in os.walk(folder_path):
@@ -64,6 +65,7 @@ def setUpClass(self):
llama_id = "yujiepan/llama-2-tiny-3layers-random"
main_export(llama_id, output="llama-2-tiny-3layers-random", task="text-generation")
model_path = find_onnx_file("llama-2-tiny-3layers-random")
+self.llama = model_path

model = onnx.load(model_path)
model = symbolic_shape_infer.SymbolicShapeInference.infer_shapes(model, auto_merge=True)
@@ -75,7 +77,7 @@ def setUpClass(self):
sess_options.optimized_model_filepath = "llama-2-tiny-3layers-random/optimized_model.onnx"
ort.InferenceSession(infer_shape_model_path, sess_options)

self.llama = "llama-2-tiny-3layers-random/optimized_model.onnx"
self.llama_optimized = "llama-2-tiny-3layers-random/optimized_model.onnx"
self.calibration_data_reader = DummyNLPDataloader(llama_id)

@classmethod
@@ -105,7 +107,7 @@ def _get_quantized_matmul_weight(self, model, matmul_name):
return weight_init

def _apply_quantize(self, quant_config, quant_func, data_reader=None):
-fp32_model = copy.deepcopy(self.llama)
+fp32_model = copy.deepcopy(self.llama_optimized)
if data_reader is None:
qmodel = quant_func(fp32_model, quant_config)
else:
@@ -132,7 +134,7 @@ def test_rtn_layer_wise_with_ort_like_api(self):
# get qmodel without layer_wise_quant
algo_config = matmul_4bits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=False)
quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(
-copy.deepcopy(self.llama),
+copy.deepcopy(self.llama_optimized),
algo_config=algo_config,
optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL,
)
@@ -144,7 +146,7 @@ def test_rtn_layer_wise_with_ort_like_api(self):
# get qmodel with layer_wise_quant
algo_config = matmul_4bits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=True)
quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(
-copy.deepcopy(self.llama),
+copy.deepcopy(self.llama_optimized),
algo_config=algo_config,
optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL,
)
@@ -183,7 +185,7 @@ def test_gptq_layer_wise_with_ort_like_api(self):
layer_wise_quant=False, calibration_data_reader=self.calibration_data_reader
)
quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(
-copy.deepcopy(self.llama),
+copy.deepcopy(self.llama_optimized),
algo_config=algo_config,
optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL,
)
@@ -197,7 +199,7 @@ def test_gptq_layer_wise_with_ort_like_api(self):
layer_wise_quant=True, calibration_data_reader=self.calibration_data_reader
)
quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(
-copy.deepcopy(self.llama),
+copy.deepcopy(self.llama_optimized),
algo_config=algo_config,
optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL,
)
@@ -213,6 +215,16 @@ def test_gptq_layer_wise_with_ort_like_api(self):
self.assertIsNotNone(quantized_weight)
self.assertTrue((lwq_quantized_weight == quantized_weight).all())

+def test__check_model_with_infer_shapes(self):
+from onnx_neural_compressor.algorithms.layer_wise import core as lwq_core
+self.assertFalse(lwq_core._check_model_with_infer_shapes(self.llama))
+self.assertTrue(lwq_core._check_model_with_infer_shapes(self.llama_optimized))
+self.assertTrue(
+lwq_core._check_model_with_infer_shapes(
+onnx_model.ONNXModel(onnx.load(self.llama_optimized, load_external_data=False))
+)
+)
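The diff does not show how `_check_model_with_infer_shapes` decides; a plausible sketch, assuming it checks whether symbolic shape inference has populated the graph's `value_info` (the helper name below is hypothetical, not the library's implementation):

```python
import onnx

def has_inferred_shapes(model_path: str) -> bool:
    # Hypothetical stand-in for lwq_core._check_model_with_infer_shapes:
    # a graph that went through symbolic shape inference carries value_info
    # entries for its intermediate tensors; a raw export typically does not.
    model = onnx.load(model_path, load_external_data=False)
    return len(model.graph.value_info) > 0
```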


if __name__ == "__main__":
unittest.main()
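For readers following the ORT-like API exercised in the tests above, the call pattern mirrors onnxruntime's MatMul4BitsQuantizer. A hedged usage sketch — the `process()` call and `model` attribute are assumed from the upstream onnxruntime API, since the corresponding lines are collapsed in this diff:

```python
import onnxruntime as ort
from onnx_neural_compressor.quantization import matmul_4bits_quantizer

# Layer-wise RTN 4-bit weight-only quantization, as in
# test_rtn_layer_wise_with_ort_like_api above.
algo_config = matmul_4bits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=True)
quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(
    "llama-2-tiny-3layers-random/optimized_model.onnx",
    algo_config=algo_config,
    optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL,
)
quant.process()       # assumed entry point, mirroring onnxruntime's quantizer
qmodel = quant.model  # assumed attribute holding the quantized model
```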
94 changes: 1 addition & 93 deletions test/quantization/test_algorithm_utility.py
@@ -1,16 +1,10 @@
"""Tests for algorithm utility components."""

import os
import shutil
+import onnx
import unittest

import numpy as np
-import onnx
-import onnxruntime
-import onnxruntime.tools.symbolic_shape_infer as symbolic_shape_infer
-import optimum.exporters.onnx
-
-from onnx_neural_compressor import onnx_model
from onnx_neural_compressor.algorithms import utility as quant_utils


@@ -22,29 +16,7 @@ def find_onnx_file(folder_path):
return os.path.join(root, file)
return None


class TestUtilityFunctions(unittest.TestCase):
-@classmethod
-def setUpClass(self):
-llama_id = "yujiepan/llama-2-tiny-3layers-random"
-optimum.exporters.onnx.main_export(llama_id, output="llama-2-tiny-3layers-random", task="text-generation")
-model_path = find_onnx_file("llama-2-tiny-3layers-random")
-self.llama = model_path
-
-model = onnx.load(model_path)
-model = symbolic_shape_infer.SymbolicShapeInference.infer_shapes(model, auto_merge=True)
-infer_shape_model_path = "llama-2-tiny-3layers-random/model-infer-shape.onnx"
-onnx.save(model, infer_shape_model_path)
-sess_options = onnxruntime.SessionOptions()
-sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
-sess_options.optimized_model_filepath = "llama-2-tiny-3layers-random/optimized_model.onnx"
-onnxruntime.InferenceSession(infer_shape_model_path, sess_options)
-
-self.llama_optimized = "llama-2-tiny-3layers-random/optimized_model.onnx"
-
-@classmethod
-def tearDownClass(self):
-shutil.rmtree("llama-2-tiny-3layers-random", ignore_errors=True)

def test_is_B_transposed(self):
node = onnx.helper.make_node(
@@ -66,67 +38,3 @@ def test_is_B_transposed(self):
beta=0.35,
)
self.assertFalse(quant_utils.is_B_transposed(node))

-def test_quantize_data(self):
-# sym int8
-data = [1, 2, 3, 4, 5]
-quantize_range = 127
-qType = onnx.onnx_pb.TensorProto.INT8
-scheme = "sym"
-rmin, rmax, zero_point, scale, quantized_data = quant_utils.quantize_data(data, quantize_range, qType, scheme)
-self.assertEqual(quantized_data.dtype, np.int8)
-
-scale, zero_point = quant_utils._calculate_scale_zp(np.array([0]), np.array([5]), quantize_range, qType, scheme)
-self.assertEqual(zero_point.dtype, np.int8)
-
-scale, zero_point = quant_utils._calculate_scale_zp(
-np.array([0]), np.array([127]), quantize_range, qType, scheme
-)
-self.assertEqual(zero_point.dtype, np.int8)
-
-# asym uint8
-data = [-1, 0, 1, 2, 3]
-quantize_range = 255
-qType = onnx.onnx_pb.TensorProto.UINT8
-scheme = "asym"
-rmin, rmax, zero_point, scale, quantized_data = quant_utils.quantize_data(data, quantize_range, qType, scheme)
-self.assertEqual(quantized_data.dtype, np.uint8)
-
-scale, zero_point = quant_utils._calculate_scale_zp(np.array([0]), np.array([5]), quantize_range, qType, scheme)
-self.assertEqual(zero_point.dtype, np.uint8)
-
-scale, zero_point = quant_utils._calculate_scale_zp(
-np.array([0]), np.array([255]), quantize_range, qType, scheme
-)
-self.assertEqual(zero_point.dtype, np.uint8)
-
-# unexpected combination
-with self.assertRaises(ValueError) as cm:
-rmin, rmax, zero_point, scale, quantized_data = quant_utils.quantize_data(
-data, quantize_range, qType=onnx.onnx_pb.TensorProto.UINT8, scheme="sym"
-)
-self.assertTrue("Unexpected combination of data type" in str(cm.exception))
-
-def test_get_qrange_for_qType(self):
-qrange = quant_utils.get_qrange_for_qType(qType=onnx.onnx_pb.TensorProto.UINT8)
-self.assertEqual(qrange, 255)
-qrange = quant_utils.get_qrange_for_qType(qType=onnx.onnx_pb.TensorProto.UINT8, reduce_range=True)
-self.assertEqual(qrange, 127)
-qrange = quant_utils.get_qrange_for_qType(qType=onnx.onnx_pb.TensorProto.INT8)
-self.assertEqual(qrange, 254)
-qrange = quant_utils.get_qrange_for_qType(qType=onnx.onnx_pb.TensorProto.INT8, reduce_range=True)
-self.assertEqual(qrange, 128)
-
-# unexpected quantization data type
-with self.assertRaises(ValueError) as cm:
-quant_utils.get_qrange_for_qType(qType=onnx.onnx_pb.TensorProto.FLOAT)
-self.assertEqual(str(cm.exception), "unsupported quantization data type")
-
-def test_check_model_with_infer_shapes(self):
-self.assertFalse(quant_utils.check_model_with_infer_shapes(self.llama))
-self.assertTrue(quant_utils.check_model_with_infer_shapes(self.llama_optimized))
-self.assertTrue(
-quant_utils.check_model_with_infer_shapes(
-onnx_model.ONNXModel(onnx.load(self.llama_optimized, load_external_data=False))
-)
-)
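The deleted `test_quantize_data` cases pinned down the expected dtypes for symmetric int8 and asymmetric uint8 quantization, plus a `ValueError` for the unsupported uint8/sym combination. For reference, a textbook min-max scale/zero-point computation of the kind those tests exercised — a sketch under standard assumptions, not the library's `_calculate_scale_zp`:

```python
import numpy as np

def calc_scale_zp(rmin, rmax, quantize_range, sym):
    # Hypothetical helper for illustration only.
    if sym:
        # Symmetric: range centered on zero, zero point pinned at 0.
        scale = max(abs(rmin), abs(rmax)) * 2.0 / quantize_range
        zero_point = 0
    else:
        # Asymmetric: stretch [min(rmin, 0), max(rmax, 0)] across the full
        # range so that real 0.0 maps exactly to an integer zero point.
        rmin, rmax = min(rmin, 0.0), max(rmax, 0.0)
        scale = (rmax - rmin) / quantize_range or 1.0
        zero_point = int(round(-rmin / scale))
    return scale, zero_point

# Asymmetric uint8, mirroring the deleted test's data = [-1, 0, 1, 2, 3]:
scale, zp = calc_scale_zp(-1.0, 3.0, quantize_range=255, sym=False)
q = np.clip(np.round(np.array([-1, 0, 1, 2, 3]) / scale) + zp, 0, 255).astype(np.uint8)
assert q.dtype == np.uint8  # same dtype expectation as the removed test
```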
