From 2340f613d3140e2df23493e0980228f3cbe20826 Mon Sep 17 00:00:00 2001
From: yuwenzho
Date: Tue, 2 Jul 2024 16:52:11 +0800
Subject: [PATCH] fix ut

Signed-off-by: yuwenzho
---
 onnx_neural_compressor/algorithms/utility.py |  6 +-
 .../layer_wise/test_layer_wise.py            | 26 +++--
 test/quantization/test_algorithm_utility.py  | 94 +------------------
 3 files changed, 23 insertions(+), 103 deletions(-)

diff --git a/onnx_neural_compressor/algorithms/utility.py b/onnx_neural_compressor/algorithms/utility.py
index d802dc04d..e8ff842f8 100644
--- a/onnx_neural_compressor/algorithms/utility.py
+++ b/onnx_neural_compressor/algorithms/utility.py
@@ -174,12 +174,12 @@ def quantize_data_per_channel(data, axis, qType, sym, reduce_range=False):
     return rmin.reshape(-1, 1), rmax.reshape(-1, 1), zero_point.reshape(-1, 1), scale.reshape(-1, 1), quantized_data


-def dequantize_data_with_scale_zero(tensor_value, scale_value, zo_value):  # pragma: no cover
+def dequantize_data_with_scale_zero(tensor_value, scale_value, zo_value):
     """Dequantize tensor with scale and zero point."""
     return (tensor_value.astype(scale_value.dtype) - zo_value.astype(scale_value.dtype)) * scale_value


-def dequantize_data(tensor_value, scale_value, zo_value, axis=0):  # pragma: no cover
+def dequantize_data(tensor_value, scale_value, zo_value, axis=0):
     """Dequantize tensor."""
     if not isinstance(scale_value, np.ndarray):
         return dequantize_data_with_scale_zero(tensor_value, scale_value, zo_value)
@@ -386,7 +386,7 @@ def make_matmul_weight_only_node(
             # require onnxruntime > 1.16.3
             kwargs["accuracy_level"] = accuracy_level

-    else:
+    else:  # pragma: no cover
         offset = 5 if zero_point is not None else 4
         op_type = "MatMulFpQ4"

diff --git a/test/quantization/layer_wise/test_layer_wise.py b/test/quantization/layer_wise/test_layer_wise.py
index 7988cd3f6..6f0a3632a 100644
--- a/test/quantization/layer_wise/test_layer_wise.py
+++ b/test/quantization/layer_wise/test_layer_wise.py
@@ -10,11 +10,12 @@ import transformers
 from optimum.exporters.onnx import main_export

-from onnx_neural_compressor import data_reader, logger
+from onnx_neural_compressor import data_reader, logger, onnx_model
 from onnx_neural_compressor.quantization import algorithm_entry as algos
 from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer

+
 def find_onnx_file(folder_path):
     # return first .onnx file path in folder_path
     for root, dirs, files in os.walk(folder_path):
@@ -64,6 +65,7 @@ def setUpClass(self):
         llama_id = "yujiepan/llama-2-tiny-3layers-random"
         main_export(llama_id, output="llama-2-tiny-3layers-random", task="text-generation")
         model_path = find_onnx_file("llama-2-tiny-3layers-random")
+        self.llama = model_path

         model = onnx.load(model_path)
         model = symbolic_shape_infer.SymbolicShapeInference.infer_shapes(model, auto_merge=True)
@@ -75,7 +77,7 @@ def setUpClass(self):
         sess_options.optimized_model_filepath = "llama-2-tiny-3layers-random/optimized_model.onnx"
         ort.InferenceSession(infer_shape_model_path, sess_options)

-        self.llama = "llama-2-tiny-3layers-random/optimized_model.onnx"
+        self.llama_optimized = "llama-2-tiny-3layers-random/optimized_model.onnx"
         self.calibration_data_reader = DummyNLPDataloader(llama_id)

     @classmethod
@@ -105,7 +107,7 @@ def _get_quantized_matmul_weight(self, model, matmul_name):
         return weight_init

     def _apply_quantize(self, quant_config, quant_func, data_reader=None):
-        fp32_model = copy.deepcopy(self.llama)
+        fp32_model = copy.deepcopy(self.llama_optimized)
         if data_reader is None:
             qmodel = quant_func(fp32_model, quant_config)
         else:
@@ -132,7 +134,7 @@ def test_rtn_layer_wise_with_ort_like_api(self):
         # get qmodel without layer_wise_quant
         algo_config = matmul_4bits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=False)
         quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(
-            copy.deepcopy(self.llama),
+            copy.deepcopy(self.llama_optimized),
             algo_config=algo_config,
             optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL,
         )
@@ -144,7 +146,7 @@ def test_rtn_layer_wise_with_ort_like_api(self):
         # get qmodel with layer_wise_quant
         algo_config = matmul_4bits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=True)
         quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(
-            copy.deepcopy(self.llama),
+            copy.deepcopy(self.llama_optimized),
             algo_config=algo_config,
             optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL,
         )
@@ -183,7 +185,7 @@ def test_gptq_layer_wise_with_ort_like_api(self):
             layer_wise_quant=False, calibration_data_reader=self.calibration_data_reader
         )
         quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(
-            copy.deepcopy(self.llama),
+            copy.deepcopy(self.llama_optimized),
             algo_config=algo_config,
             optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL,
         )
@@ -197,7 +199,7 @@ def test_gptq_layer_wise_with_ort_like_api(self):
             layer_wise_quant=True, calibration_data_reader=self.calibration_data_reader
         )
         quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(
-            copy.deepcopy(self.llama),
+            copy.deepcopy(self.llama_optimized),
             algo_config=algo_config,
             optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL,
         )
@@ -213,6 +215,16 @@ def test_gptq_layer_wise_with_ort_like_api(self):
         self.assertIsNotNone(quantized_weight)
         self.assertTrue((lwq_quantized_weight == quantized_weight).all())

+    def test__check_model_with_infer_shapes(self):
+        from onnx_neural_compressor.algorithms.layer_wise import core as lwq_core
+
+        self.assertFalse(lwq_core._check_model_with_infer_shapes(self.llama))
+        self.assertTrue(lwq_core._check_model_with_infer_shapes(self.llama_optimized))
+        self.assertTrue(
+            lwq_core._check_model_with_infer_shapes(
+                onnx_model.ONNXModel(onnx.load(self.llama_optimized, load_external_data=False))
+            )
+        )
+

 if __name__ == "__main__":
     unittest.main()
diff --git a/test/quantization/test_algorithm_utility.py b/test/quantization/test_algorithm_utility.py
index 28a8f2a7a..630f4859b 100644
--- a/test/quantization/test_algorithm_utility.py
+++ b/test/quantization/test_algorithm_utility.py
@@ -1,16 +1,10 @@
 """Tests for algorithm utility components."""

 import os
-import shutil
+import onnx
 import unittest
-
 import numpy as np
-import onnx
-import onnxruntime
-import onnxruntime.tools.symbolic_shape_infer as symbolic_shape_infer
-import optimum.exporters.onnx

-from onnx_neural_compressor import onnx_model
 from onnx_neural_compressor.algorithms import utility as quant_utils
@@ -22,29 +16,7 @@ def find_onnx_file(folder_path):
             return os.path.join(root, file)
     return None

-
 class TestUtilityFunctions(unittest.TestCase):
-    @classmethod
-    def setUpClass(self):
-        llama_id = "yujiepan/llama-2-tiny-3layers-random"
-        optimum.exporters.onnx.main_export(llama_id, output="llama-2-tiny-3layers-random", task="text-generation")
-        model_path = find_onnx_file("llama-2-tiny-3layers-random")
-        self.llama = model_path
-
-        model = onnx.load(model_path)
-        model = symbolic_shape_infer.SymbolicShapeInference.infer_shapes(model, auto_merge=True)
-        infer_shape_model_path = "llama-2-tiny-3layers-random/model-infer-shape.onnx"
-        onnx.save(model, infer_shape_model_path)
-        sess_options = onnxruntime.SessionOptions()
-        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
-        sess_options.optimized_model_filepath = "llama-2-tiny-3layers-random/optimized_model.onnx"
-        onnxruntime.InferenceSession(infer_shape_model_path, sess_options)
-
-        self.llama_optimized = "llama-2-tiny-3layers-random/optimized_model.onnx"
-
-    @classmethod
-    def tearDownClass(self):
-        shutil.rmtree("llama-2-tiny-3layers-random", ignore_errors=True)

     def test_is_B_transposed(self):
         node = onnx.helper.make_node(
@@ -66,67 +38,3 @@ def test_is_B_transposed(self):
             beta=0.35,
         )
         self.assertFalse(quant_utils.is_B_transposed(node))
-
-    def test_quantize_data(self):
-        # sym int8
-        data = [1, 2, 3, 4, 5]
-        quantize_range = 127
-        qType = onnx.onnx_pb.TensorProto.INT8
-        scheme = "sym"
-        rmin, rmax, zero_point, scale, quantized_data = quant_utils.quantize_data(data, quantize_range, qType, scheme)
-        self.assertEqual(quantized_data.dtype, np.int8)
-
-        scale, zero_point = quant_utils._calculate_scale_zp(np.array([0]), np.array([5]), quantize_range, qType, scheme)
-        self.assertEqual(zero_point.dtype, np.int8)
-
-        scale, zero_point = quant_utils._calculate_scale_zp(
-            np.array([0]), np.array([127]), quantize_range, qType, scheme
-        )
-        self.assertEqual(zero_point.dtype, np.int8)
-
-        # asym uint8
-        data = [-1, 0, 1, 2, 3]
-        quantize_range = 255
-        qType = onnx.onnx_pb.TensorProto.UINT8
-        scheme = "asym"
-        rmin, rmax, zero_point, scale, quantized_data = quant_utils.quantize_data(data, quantize_range, qType, scheme)
-        self.assertEqual(quantized_data.dtype, np.uint8)
-
-        scale, zero_point = quant_utils._calculate_scale_zp(np.array([0]), np.array([5]), quantize_range, qType, scheme)
-        self.assertEqual(zero_point.dtype, np.uint8)
-
-        scale, zero_point = quant_utils._calculate_scale_zp(
-            np.array([0]), np.array([255]), quantize_range, qType, scheme
-        )
-        self.assertEqual(zero_point.dtype, np.uint8)
-
-        # unexpected combination
-        with self.assertRaises(ValueError) as cm:
-            rmin, rmax, zero_point, scale, quantized_data = quant_utils.quantize_data(
-                data, quantize_range, qType=onnx.onnx_pb.TensorProto.UINT8, scheme="sym"
-            )
-        self.assertTrue("Unexpected combination of data type" in str(cm.exception))
-
-    def test_get_qrange_for_qType(self):
-        qrange = quant_utils.get_qrange_for_qType(qType=onnx.onnx_pb.TensorProto.UINT8)
-        self.assertEqual(qrange, 255)
-        qrange = quant_utils.get_qrange_for_qType(qType=onnx.onnx_pb.TensorProto.UINT8, reduce_range=True)
-        self.assertEqual(qrange, 127)
-        qrange = quant_utils.get_qrange_for_qType(qType=onnx.onnx_pb.TensorProto.INT8)
-        self.assertEqual(qrange, 254)
-        qrange = quant_utils.get_qrange_for_qType(qType=onnx.onnx_pb.TensorProto.INT8, reduce_range=True)
-        self.assertEqual(qrange, 128)
-
-        # unexpected quantization data type
-        with self.assertRaises(ValueError) as cm:
-            quant_utils.get_qrange_for_qType(qType=onnx.onnx_pb.TensorProto.FLOAT)
-        self.assertEqual(str(cm.exception), "unsupported quantization data type")
-
-    def test_check_model_with_infer_shapes(self):
-        self.assertFalse(quant_utils.check_model_with_infer_shapes(self.llama))
-        self.assertTrue(quant_utils.check_model_with_infer_shapes(self.llama_optimized))
-        self.assertTrue(
-            quant_utils.check_model_with_infer_shapes(
-                onnx_model.ONNXModel(onnx.load(self.llama_optimized, load_external_data=False))
-            )
-        )