diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml new file mode 100644 index 000000000..9839352d0 --- /dev/null +++ b/.github/workflows/lint.yaml @@ -0,0 +1,84 @@ +# Copyright (c) ONNX Neural Compressor Project Contributors +# +# SPDX-License-Identifier: Apache-2.0 + +name: Lint + +on: + push: + branches: + - main + pull_request: + merge_group: + +permissions: # set top-level default permissions as security best practice + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + optional-lint: + name: Optional Lint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - name: misspell # Check spellings as well + uses: reviewdog/action-misspell@5bd7be2fc7ae56a517184f5c4bbcf2fd7afe3927 # v1.17.0 + with: + github_token: ${{ secrets.github_token }} + locale: "US" + reporter: github-pr-check + level: info + filter_mode: diff_context + - name: shellcheck # Static check shell scripts + uses: reviewdog/action-shellcheck@72365a51bf6476fe952a117c3ff703eb7775e40a # v1.20.0 + with: + github_token: ${{ secrets.github_token }} + reporter: github-pr-check + level: info + filter_mode: diff_context + + enforce-style: + name: Enforce style + runs-on: ubuntu-latest + permissions: + security-events: write + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - name: Setup Python + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + with: + python-version: "3.12" + - name: Install ONNX Neural Compressor + run: | + pip install . + - name: Install dependencies + run: | + python -m pip install lintrunner lintrunner-adapters + lintrunner init + - name: Run lintrunner on all files + run: | + set +e + if ! lintrunner --force-color --all-files --tee-json=lint.json -v; then + echo "" + echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m" + echo -e "\e[1m\e[36mSee https://github.com/onnx/neural-compressor/blob/main/.lintrunner.toml for setup instructions.\e[0m" + exit 1 + fi + - name: Produce SARIF + if: always() + run: | + python -m lintrunner_adapters to-sarif lint.json lintrunner.sarif + - name: Upload SARIF file + # Use always() to always upload SARIF even if lintrunner returns with error code + # To toggle linter comments in the files page, press `i` on the keyboard + if: always() + continue-on-error: true + uses: github/codeql-action/upload-sarif@cdcdbb579706841c47f7063dda365e292e5cad7a # v2.13.4 + with: + # Path to SARIF file relative to the root of the repository + sarif_file: lintrunner.sarif + category: lintrunner + checkout_path: ${{ github.workspace }} diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py index e70c8a188..67c5e9685 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py @@ -15,74 +15,56 @@ # specific language governing permissions and limitations # under the License. 
# pylint:disable=redefined-outer-name,logging-format-interpolation -import os -import onnx -import time +import argparse import json -import random -import torch import logging -import argparse +import os import random -import numpy as np +import time + import datasets +import numpy as np +import onnx import onnxruntime as ort +import torch import transformers -from torch.nn import functional -from torch.utils import data from intel_extension_for_transformers.transformers.llm.evaluation import lm_eval from optimum import onnxruntime as optimum_ort -from onnx_neural_compressor.quantization import matmul_nbits_quantizer +from torch.nn import functional +from torch.utils import data + from onnx_neural_compressor import config -from onnx_neural_compressor import logger -from onnx_neural_compressor.quantization import tuning from onnx_neural_compressor import data_reader +from onnx_neural_compressor import logger from onnx_neural_compressor import utility +from onnx_neural_compressor.quantization import matmul_nbits_quantizer +from onnx_neural_compressor.quantization import tuning -logger = logging.getLogger(__name__) logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.WARN) - -parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("--model_path", - type=str, - help="Folder path of pre-trained onnx model") -parser.add_argument( - "--benchmark", - action="store_true", \ - default=False + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN ) -parser.add_argument( - "--tune", - action="store_true", \ - default=False, - help="whether quantize the model" -) -parser.add_argument("--output_model", - type=str, - default=None, - help="output model path") + +parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument("--model_path", type=str, help="Folder path of pre-trained onnx model") +parser.add_argument("--benchmark", action="store_true", default=False) +parser.add_argument("--tune", action="store_true", default=False, help="whether quantize the model") +parser.add_argument("--output_model", type=str, default=None, help="output model path") parser.add_argument( "--batch_size", default=1, type=int, ) -parser.add_argument("--tokenizer", - type=str, - help="pretrained model name or path of tokenizer files", - default="meta-llama/Llama-2-7b-hf") -parser.add_argument("--workspace", - type=str, - help="workspace to save intermediate files", - default="nc_workspace") -parser.add_argument("--algorithm", - type=str, - default="WOQ_TUNE", - choices=["WOQ_TUNE", "RTN", "AWQ", "GPTQ"], - help="weight only algorithm") +parser.add_argument( + "--tokenizer", type=str, help="pretrained model name or path of tokenizer files", default="meta-llama/Llama-2-7b-hf" +) +parser.add_argument("--workspace", type=str, help="workspace to save intermediate files", default="nc_workspace") +parser.add_argument( + "--algorithm", + type=str, + default="WOQ_TUNE", + choices=["WOQ_TUNE", "RTN", "AWQ", "GPTQ"], + help="weight only algorithm", +) parser.add_argument( "--pad_max", default=196, @@ -96,18 +78,22 @@ parser.add_argument( "--tasks", nargs="+", - default=["winogrande", "copa", "piqa", "rte", "hellaswag", "openbookqa", \ - "lambada_openai", "lambada_standard", "wikitext"], + default=[ + "winogrande", + "copa", + "piqa", + "rte", + "hellaswag", + "openbookqa", + "lambada_openai", + 
"lambada_standard", + "wikitext", + ], type=str, - help="tasks list for accuracy validation" + help="tasks list for accuracy validation", ) -parser.add_argument("--dataset", - nargs="?", - default="NeelNanda/pile-10k", - const="NeelNanda/pile-10k") -parser.add_argument('--mode', - type=str, - help="benchmark mode of performance or accuracy") +parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k") +parser.add_argument("--mode", type=str, help="benchmark mode of performance or accuracy") parser.add_argument("--intra_op_num_threads", type=int, default=24) args = parser.parse_args() @@ -129,7 +115,7 @@ def replace_architectures(json_path): data = json.load(file) data["architectures"] = ["LlamaForCausalLM"] - with open(json_path, 'w') as file: + with open(json_path, "w") as file: json.dump(data, file, indent=4) @@ -151,12 +137,10 @@ def eval_func(model): eval_acc = 0 for task_name in args.tasks: if task_name == "wikitext": - print("Accuracy for %s is: %s" % - (task_name, results["results"][task_name]["word_perplexity"])) + print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity"])) eval_acc += results["results"][task_name]["word_perplexity"] else: - print("Accuracy for %s is: %s" % - (task_name, results["results"][task_name]["acc"])) + print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc"])) eval_acc += results["results"][task_name]["acc"] if len(args.tasks) != 0: @@ -170,14 +154,10 @@ def benchmark(model): sess_options.intra_op_num_threads = args.intra_op_num_threads session = optimum_ort.ORTModelForCausalLM.load_model( # pylint: disable=E1123 - os.path.join(model, "model.onnx"), - session_options=sess_options) + os.path.join(model, "model.onnx"), session_options=sess_options + ) inputs_names = session.get_inputs() - key_value_input_names = [ - key.name - for key in inputs_names - if (".key" in key.name) or (".value" in key.name) - ] + key_value_input_names = [key.name for key in inputs_names if (".key" in key.name) or (".value" in key.name)] use_cache = len(key_value_input_names) > 0 model = optimum_ort.ORTModelForCausalLM( @@ -219,19 +199,13 @@ def benchmark(model): class AWQDataloader(data_reader.CalibrationDataReader): - def __init__(self, - model_path, - pad_max=196, - batch_size=1, - sub_folder='train', - calibration_sampling_size=8): + def __init__(self, model_path, pad_max=196, batch_size=1, sub_folder="train", calibration_sampling_size=8): self.encoded_list = [] self.pad_max = pad_max self.batch_size = batch_size dataset = datasets.load_dataset(args.dataset, split=sub_folder) dataset = dataset.map(tokenize_function, batched=True) - dataset.set_format(type="torch", - columns=["input_ids", "attention_mask"]) + dataset.set_format(type="torch", columns=["input_ids", "attention_mask"]) dataloader = data.DataLoader( dataset, batch_size=self.batch_size, @@ -240,9 +214,7 @@ def __init__(self, ) model = onnx.load(model_path, load_external_data=False) inputs_names = [input.name for input in model.graph.input] - key_value_input_names = [ - key for key in inputs_names if (".key" in key) or (".value" in key) - ] + key_value_input_names = [key for key in inputs_names if (".key" in key) or (".value" in key)] use_cache = len(key_value_input_names) > 0 self.batch_size = batch_size @@ -250,20 +222,16 @@ def __init__(self, if idx + 1 > calibration_sampling_size: break ort_input = {} - ort_input["input_ids"] = input_ids[:, :-1].detach().cpu().numpy( - ).astype("int64") - 
ort_input["attention_mask"] = attention_mask[:, :-1].detach().cpu( - ).numpy().astype("int64") + ort_input["input_ids"] = input_ids[:, :-1].detach().cpu().numpy().astype("int64") + ort_input["attention_mask"] = attention_mask[:, :-1].detach().cpu().numpy().astype("int64") position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) - ort_input["position_ids"] = position_ids[:, :-1].detach().cpu( - ).numpy().astype("int64") + ort_input["position_ids"] = position_ids[:, :-1].detach().cpu().numpy().astype("int64") if use_cache: # Create dummy past_key_values for decoder num_attention_heads = model_config.num_key_value_heads embed_size_per_head = model_config.hidden_size // model_config.num_attention_heads - shape = (self.batch_size, num_attention_heads, 0, - embed_size_per_head) + shape = (self.batch_size, num_attention_heads, 0, embed_size_per_head) key_or_value = np.zeros(shape, dtype=np.float32) for key_value_input_name in key_value_input_names: ort_input[key_value_input_name] = key_or_value @@ -284,8 +252,7 @@ def collate_batch(self, batch): attention_mask = functional.pad(attention_mask, (0, pad_len), value=0) input_ids_padded.append(input_ids) attention_mask_padded.append(attention_mask) - return torch.vstack(input_ids_padded), torch.vstack( - attention_mask_padded) + return torch.vstack(input_ids_padded), torch.vstack(attention_mask_padded) def get_next(self): return next(self.iter_next, None) @@ -296,26 +263,18 @@ def rewind(self): class GPTQDataloader(data_reader.CalibrationDataReader): - def __init__(self, - model_path, - batch_size=1, - seqlen=2048, - sub_folder="train", - calibration_sampling_size=8): + def __init__(self, model_path, batch_size=1, seqlen=2048, sub_folder="train", calibration_sampling_size=8): random.seed(0) self.encoded_list = [] self.batch_size = batch_size traindata = datasets.load_dataset(args.dataset, split=sub_folder) traindata = traindata.map(tokenize_function, batched=True) - traindata.set_format(type="torch", - columns=["input_ids", "attention_mask"]) + traindata.set_format(type="torch", columns=["input_ids", "attention_mask"]) session = ort.InferenceSession(model_path) inputs_names = [input.name for input in session.get_inputs()] - key_value_input_names = [ - key for key in inputs_names if (".key" in key) or (".value" in key) - ] + key_value_input_names = [key for key in inputs_names if (".key" in key) or (".value" in key)] use_cache = len(key_value_input_names) > 0 for i in range(calibration_sampling_size): @@ -331,19 +290,15 @@ def __init__(self, ort_input = {} ort_input["input_ids"] = inp.detach().cpu().numpy().astype("int64") - ort_input["attention_mask"] = mask.detach().cpu().numpy().astype( - "int64") + ort_input["attention_mask"] = mask.detach().cpu().numpy().astype("int64") input_shape = ort_input["input_ids"].shape - position_ids = torch.arange(0, input_shape[-1], - dtype=torch.long).unsqueeze(0).view( - -1, input_shape[-1]) + position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) ort_input["position_ids"] = position_ids.numpy() if use_cache: # create dummy past_key_values for decoder first generation step num_attention_heads = model_config.num_key_value_heads embed_size_per_head = model_config.hidden_size // model_config.num_attention_heads - shape = (self.batch_size, num_attention_heads, 0, - embed_size_per_head) + shape = (self.batch_size, num_attention_heads, 0, embed_size_per_head) key_or_value = np.zeros(shape, dtype=np.float32) for 
key_value_input_name in key_value_input_names: ort_input[key_value_input_name] = key_or_value @@ -364,9 +319,9 @@ def rewind(self): os.mkdir(args.workspace) if args.benchmark: - if args.mode == 'performance': + if args.mode == "performance": benchmark(args.model_path) - elif args.mode == 'accuracy': + elif args.mode == "accuracy": acc_result = eval_func(args.model_path) print("Batch size = %d" % args.batch_size) print("Accuracy: %.5f" % acc_result) @@ -379,17 +334,12 @@ def rewind(self): logger.info("Start graph optimization...") sess_options = ort.SessionOptions() sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED - sess_options.optimized_model_filepath = os.path.join( - args.workspace, "Optimized_model.onnx") - sess_options.add_session_config_entry( - "session.optimized_model_external_initializers_file_name", - "Optimized_model.onnx_data") + sess_options.optimized_model_filepath = os.path.join(args.workspace, "Optimized_model.onnx") sess_options.add_session_config_entry( - "session.optimized_model_external_initializers_min_size_in_bytes", - "1024") - sess = ort.InferenceSession(model_path, - sess_options, - providers=["CPUExecutionProvider"]) + "session.optimized_model_external_initializers_file_name", "Optimized_model.onnx_data" + ) + sess_options.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "1024") + sess = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"]) logger.info("Graph optimization done.") best_model = None @@ -406,12 +356,10 @@ def rewind(self): best_model = quant.model elif args.algorithm.upper() == "AWQ": - calibration_data_reader = AWQDataloader(model_path, - pad_max=args.pad_max, - batch_size=1) + calibration_data_reader = AWQDataloader(model_path, pad_max=args.pad_max, batch_size=1) algo_config = matmul_nbits_quantizer.AWQWeightOnlyQuantConfig( - calibration_data_reader=calibration_data_reader, - enable_mse_search=False) + calibration_data_reader=calibration_data_reader, enable_mse_search=False + ) quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( sess_options.optimized_model_filepath, n_bits=4, @@ -423,11 +371,10 @@ def rewind(self): best_model = quant.model elif args.algorithm.upper() == "GPTQ": - calibration_data_reader = GPTQDataloader(model_path, - seqlen=args.seqlen, - batch_size=1) + calibration_data_reader = GPTQDataloader(model_path, seqlen=args.seqlen, batch_size=1) algo_config = matmul_nbits_quantizer.GPTQWeightOnlyQuantConfig( - calibration_data_reader=calibration_data_reader,) + calibration_data_reader=calibration_data_reader, + ) quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( sess_options.optimized_model_filepath, n_bits=4, @@ -439,12 +386,9 @@ def rewind(self): best_model = quant.model elif args.algorithm.upper() == "WOQ_TUNE": - calibration_data_reader = GPTQDataloader(model_path, - seqlen=args.seqlen, - batch_size=1) + calibration_data_reader = GPTQDataloader(model_path, seqlen=args.seqlen, batch_size=1) # set tolerable_loss to 0.5% for test, default is 1% - custom_tune_config = tuning.TuningConfig( - config_set=config.get_woq_tuning_config(), tolerable_loss=0.005) + custom_tune_config = tuning.TuningConfig(config_set=config.get_woq_tuning_config(), tolerable_loss=0.005) best_model = tuning.autotune( model_input=model_path, tune_config=custom_tune_config, @@ -458,5 +402,4 @@ def rewind(self): os.path.join(args.output_model, model_name), save_as_external_data=True, ) - model_config.to_json_file(os.path.join(args.output_model, 
"config.json"), - use_diff=False) + model_config.to_json_file(os.path.join(args.output_model, "config.json"), use_diff=False) diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py index 188f02a5b..3af820943 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py @@ -1,6 +1,7 @@ import argparse import os import subprocess + import optimum.version from packaging import version @@ -16,7 +17,8 @@ def parse_arguments(): type=str, required=False, default="text-generation-with-past", - choices=["text-generation-with-past", "text-generation"]) + choices=["text-generation-with-past", "text-generation"], + ) return parser.parse_args() diff --git a/onnx_neural_compressor/algorithms/layer_wise/core.py b/onnx_neural_compressor/algorithms/layer_wise/core.py index 1b5cb680e..1de8f3f40 100644 --- a/onnx_neural_compressor/algorithms/layer_wise/core.py +++ b/onnx_neural_compressor/algorithms/layer_wise/core.py @@ -23,6 +23,7 @@ import onnx import onnxruntime as ort + from onnx_neural_compressor import data_reader from onnx_neural_compressor import logger from onnx_neural_compressor import onnx_model diff --git a/onnx_neural_compressor/algorithms/smoother/calibrator.py b/onnx_neural_compressor/algorithms/smoother/calibrator.py index 7fddd2cc9..7fedbdae1 100644 --- a/onnx_neural_compressor/algorithms/smoother/calibrator.py +++ b/onnx_neural_compressor/algorithms/smoother/calibrator.py @@ -22,6 +22,7 @@ import numpy as np import onnx import onnxruntime + from onnx_neural_compressor import data_reader from onnx_neural_compressor import logger from onnx_neural_compressor import onnx_model diff --git a/onnx_neural_compressor/algorithms/smoother/core.py b/onnx_neural_compressor/algorithms/smoother/core.py index d30f78003..69494f92f 100644 --- a/onnx_neural_compressor/algorithms/smoother/core.py +++ b/onnx_neural_compressor/algorithms/smoother/core.py @@ -20,6 +20,7 @@ import numpy as np import onnx import onnxruntime as ort + from onnx_neural_compressor import data_reader from onnx_neural_compressor import logger from onnx_neural_compressor import onnx_model diff --git a/onnx_neural_compressor/algorithms/weight_only/awq.py b/onnx_neural_compressor/algorithms/weight_only/awq.py index b3521b8d3..ed689c3ed 100644 --- a/onnx_neural_compressor/algorithms/weight_only/awq.py +++ b/onnx_neural_compressor/algorithms/weight_only/awq.py @@ -22,6 +22,8 @@ import numpy as np import onnx import onnxruntime as ort +from packaging import version + from onnx_neural_compressor import config from onnx_neural_compressor import constants from onnx_neural_compressor import data_reader @@ -30,7 +32,6 @@ from onnx_neural_compressor import utility from onnx_neural_compressor.algorithms.weight_only import rtn from onnx_neural_compressor.algorithms.weight_only import utility as woq_utility -from packaging import version from typing import List, Union # isort: skip diff --git a/onnx_neural_compressor/algorithms/weight_only/gptq.py b/onnx_neural_compressor/algorithms/weight_only/gptq.py index 07cc4cd1f..b5ac367bc 100644 --- a/onnx_neural_compressor/algorithms/weight_only/gptq.py +++ b/onnx_neural_compressor/algorithms/weight_only/gptq.py @@ -22,6 +22,8 @@ import numpy as np import onnx import onnxruntime as ort +from packaging.version import 
Version + from onnx_neural_compressor import config from onnx_neural_compressor import constants from onnx_neural_compressor import data_reader @@ -29,7 +31,6 @@ from onnx_neural_compressor import utility from onnx_neural_compressor.algorithms.layer_wise import core from onnx_neural_compressor.algorithms.weight_only import utility as woq_utility -from packaging.version import Version from typing import List, Union # isort: skip diff --git a/onnx_neural_compressor/algorithms/weight_only/rtn.py b/onnx_neural_compressor/algorithms/weight_only/rtn.py index 8deb39f14..590f5305b 100644 --- a/onnx_neural_compressor/algorithms/weight_only/rtn.py +++ b/onnx_neural_compressor/algorithms/weight_only/rtn.py @@ -24,13 +24,14 @@ import numpy as np import onnx import onnxruntime as ort +from packaging import version + from onnx_neural_compressor import config from onnx_neural_compressor import constants from onnx_neural_compressor import onnx_model from onnx_neural_compressor import utility from onnx_neural_compressor.algorithms.layer_wise import core from onnx_neural_compressor.algorithms.weight_only import utility as woq_utility -from packaging import version from typing import List, Union # isort: skip diff --git a/onnx_neural_compressor/algorithms/weight_only/utility.py b/onnx_neural_compressor/algorithms/weight_only/utility.py index 0a6e59d4b..f9c72f41b 100644 --- a/onnx_neural_compressor/algorithms/weight_only/utility.py +++ b/onnx_neural_compressor/algorithms/weight_only/utility.py @@ -25,9 +25,10 @@ import numpy as np import onnx import onnxruntime as ort +from packaging import version + from onnx_neural_compressor import constants from onnx_neural_compressor import utility -from packaging import version if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): # pragma: no cover import onnxruntime_extensions @@ -118,8 +119,8 @@ def make_matmul_weight_only_node( even_idx = idx[::2] odd_idx = idx[1::2] # vectorized operation for even and odd indices - packed_zp[even_idx // 2] = ((packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel()) - packed_zp[odd_idx // 2] = ((packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4)) + packed_zp[even_idx // 2] = (packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel() + packed_zp[odd_idx // 2] = (packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4) zp_tensor = onnx.helper.make_tensor( name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True @@ -281,7 +282,7 @@ def quant_tensor( max_range = np.maximum(np.abs(rmin), np.abs(rmax)) scale = np.ones(rmax.shape) - mask = (max_range > 0) + mask = max_range > 0 scale[mask] = (max_range[mask] * 2.0).astype(np.float64) / (maxq - minq) zero_point = ( np.zeros(scale.shape) if dtype == "int" else np.ones(rmax.shape, dtype="uint8") * (1 << (num_bits - 1)) @@ -304,6 +305,7 @@ def quant_tensor( return q_weight, scale, zero_point + def qdq_tensor( data: np.array, num_bits: int = 4, diff --git a/onnx_neural_compressor/config.py b/onnx_neural_compressor/config.py index 61ab8fc67..cf17c8d38 100644 --- a/onnx_neural_compressor/config.py +++ b/onnx_neural_compressor/config.py @@ -29,12 +29,13 @@ import numpy as np import onnx import pydantic +from onnxruntime import quantization +from typing_extensions import Self + from onnx_neural_compressor import constants from onnx_neural_compressor import data_reader from onnx_neural_compressor import logger from onnx_neural_compressor import utility -from onnxruntime import quantization 
-from typing_extensions import Self from collections import OrderedDict # isort: skip from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Type, Union, _GenericAlias # isort: skip @@ -1239,4 +1240,4 @@ def generate_nc_sq_config(quant_config: quantization.StaticQuantConfig): quant_config.extra_options["SmoothQuant"] = False quant_config_dict = quant_config.to_dict() nc_sq_config = SmoothQuantConfig(**quant_kwargs, **quant_config_dict) - return nc_sq_config \ No newline at end of file + return nc_sq_config diff --git a/onnx_neural_compressor/onnx_model.py b/onnx_neural_compressor/onnx_model.py index 2db10353f..d186641cf 100644 --- a/onnx_neural_compressor/onnx_model.py +++ b/onnx_neural_compressor/onnx_model.py @@ -21,10 +21,11 @@ import onnx import transformers +from onnxruntime.quantization import onnx_model + from onnx_neural_compressor import constants from onnx_neural_compressor import logger from onnx_neural_compressor import utility -from onnxruntime.quantization import onnx_model class ONNXModel(onnx_model.ONNXModel): diff --git a/onnx_neural_compressor/quantization/__init__.py b/onnx_neural_compressor/quantization/__init__.py index 7245f8724..a25142b13 100644 --- a/onnx_neural_compressor/quantization/__init__.py +++ b/onnx_neural_compressor/quantization/__init__.py @@ -15,4 +15,5 @@ from onnxruntime.quantization.quant_utils import QuantFormat from onnxruntime.quantization.quant_utils import QuantType + from onnx_neural_compressor.quantization.quantize import quantize diff --git a/onnx_neural_compressor/quantization/algorithm_entry.py b/onnx_neural_compressor/quantization/algorithm_entry.py index 982ea3a14..706275440 100644 --- a/onnx_neural_compressor/quantization/algorithm_entry.py +++ b/onnx_neural_compressor/quantization/algorithm_entry.py @@ -17,6 +17,8 @@ from typing import Union import onnx +from onnxruntime import quantization + from onnx_neural_compressor import config from onnx_neural_compressor import constants from onnx_neural_compressor import data_reader @@ -26,7 +28,6 @@ from onnx_neural_compressor.algorithms.weight_only import awq from onnx_neural_compressor.algorithms.weight_only import gptq from onnx_neural_compressor.algorithms.weight_only import rtn -from onnxruntime import quantization ###################### SmoothQuant Entry ################################## diff --git a/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py b/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py index ea1cf62a9..62a671fba 100644 --- a/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py +++ b/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py @@ -15,9 +15,10 @@ from typing import List, Union # isort: skip import onnx -from onnx_neural_compressor.quantization import matmul_nbits_quantizer from onnxruntime.quantization import matmul_4bits_quantizer +from onnx_neural_compressor.quantization import matmul_nbits_quantizer + RTNWeightOnlyQuantConfig = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig AWQWeightOnlyQuantConfig = matmul_nbits_quantizer.AWQWeightOnlyQuantConfig GPTQWeightOnlyQuantConfig = matmul_nbits_quantizer.GPTQWeightOnlyQuantConfig diff --git a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py index b1a4d7488..cc69515d4 100644 --- a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py +++ b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py @@ -15,13 +15,14 @@ from typing import List, Union # isort: skip import onnx 
+from onnxruntime.quantization import matmul_4bits_quantizer + from onnx_neural_compressor import config from onnx_neural_compressor import data_reader from onnx_neural_compressor import logger from onnx_neural_compressor import onnx_model from onnx_neural_compressor import utility from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnxruntime.quantization import matmul_4bits_quantizer class RTNWeightOnlyQuantConfig(matmul_4bits_quantizer.RTNWeightOnlyQuantConfig): diff --git a/onnx_neural_compressor/quantization/quantize.py b/onnx_neural_compressor/quantization/quantize.py index f586655dc..7e388e3aa 100644 --- a/onnx_neural_compressor/quantization/quantize.py +++ b/onnx_neural_compressor/quantization/quantize.py @@ -16,9 +16,10 @@ from typing import Union import onnx +from onnxruntime.quantization.quantize import QuantConfig + from onnx_neural_compressor import config from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnxruntime.quantization.quantize import QuantConfig # ORT-like user-facing API diff --git a/onnx_neural_compressor/quantization/tuning.py b/onnx_neural_compressor/quantization/tuning.py index 91e7eae14..8351d868f 100644 --- a/onnx_neural_compressor/quantization/tuning.py +++ b/onnx_neural_compressor/quantization/tuning.py @@ -19,6 +19,7 @@ import uuid import onnx + from onnx_neural_compressor import config from onnx_neural_compressor import data_reader from onnx_neural_compressor import logger diff --git a/onnx_neural_compressor/utility.py b/onnx_neural_compressor/utility.py index 0cb7b1335..d234d66e0 100644 --- a/onnx_neural_compressor/utility.py +++ b/onnx_neural_compressor/utility.py @@ -23,9 +23,10 @@ import numpy as np import onnx import psutil +from onnxruntime.quantization import onnx_model + from onnx_neural_compressor import constants from onnx_neural_compressor import logger -from onnxruntime.quantization import onnx_model from typing import Callable, Dict, List, Tuple, Union # isort: skip diff --git a/pyproject.toml b/pyproject.toml index 06b02dfe1..ce25be2f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,6 @@ [tool.isort] profile = "black" line_length = 120 -known_first_party = ["neural_compressor"] extend_skip_glob = ["**/__init__.py"] force_single_line = true diff --git a/setup.py b/setup.py index cdc3d0479..c80178535 100644 --- a/setup.py +++ b/setup.py @@ -49,8 +49,16 @@ def get_build_version(): url="", packages=setuptools.find_packages(), include_package_data=True, - install_requires=["onnx", "onnxruntime", "onnxruntime-extensions", "psutil", "numpy", - "py-cpuinfo", "pydantic", "transformers"], + install_requires=[ + "onnx", + "onnxruntime", + "onnxruntime-extensions", + "psutil", + "numpy", + "py-cpuinfo", + "pydantic", + "transformers", + ], python_requires=">=3.8.0", classifiers=[ "Intended Audience :: Science/Research", diff --git a/test/quantization/layer_wise/test_layer_wise.py b/test/quantization/layer_wise/test_layer_wise.py index 994387eb4..7b0f47cf5 100644 --- a/test/quantization/layer_wise/test_layer_wise.py +++ b/test/quantization/layer_wise/test_layer_wise.py @@ -8,12 +8,13 @@ import onnxruntime.tools.symbolic_shape_infer as symbolic_shape_infer import torch import transformers +from optimum.exporters.onnx import main_export + from onnx_neural_compressor import config from onnx_neural_compressor import data_reader from onnx_neural_compressor import logger from onnx_neural_compressor.quantization import algorithm_entry as algos from onnx_neural_compressor.quantization import 
matmul_4bits_quantizer -from optimum.exporters.onnx import main_export def find_onnx_file(folder_path): diff --git a/test/quantization/test_autotune.py b/test/quantization/test_autotune.py index 031b8369e..5f465155a 100644 --- a/test/quantization/test_autotune.py +++ b/test/quantization/test_autotune.py @@ -24,10 +24,11 @@ import numpy as np import onnx import onnxruntime as ort +from optimum.exporters.onnx import main_export + from onnx_neural_compressor import config from onnx_neural_compressor import data_reader from onnx_neural_compressor.quantization import tuning -from optimum.exporters.onnx import main_export from typing import Callable, Dict, List, Optional, Union # isort: skip diff --git a/test/quantization/test_config.py b/test/quantization/test_config.py index a7e142978..37e34835d 100644 --- a/test/quantization/test_config.py +++ b/test/quantization/test_config.py @@ -5,11 +5,12 @@ import numpy as np import onnx +from optimum.exporters.onnx import main_export + from onnx_neural_compressor import config from onnx_neural_compressor import logger from onnx_neural_compressor import utility from onnx_neural_compressor.quantization import algorithm_entry as algos -from optimum.exporters.onnx import main_export def find_onnx_file(folder_path): diff --git a/test/quantization/test_smooth_quant.py b/test/quantization/test_smooth_quant.py index 56962af85..284c8be29 100644 --- a/test/quantization/test_smooth_quant.py +++ b/test/quantization/test_smooth_quant.py @@ -21,12 +21,13 @@ import numpy as np import onnx +from optimum.exporters.onnx import main_export + from onnx_neural_compressor import config from onnx_neural_compressor import data_reader from onnx_neural_compressor.quantization import QuantType from onnx_neural_compressor.quantization import algorithm_entry as algos from onnx_neural_compressor.quantization import quantize -from optimum.exporters.onnx import main_export class DataReader(data_reader.CalibrationDataReader): diff --git a/test/quantization/weight_only/test_awq.py b/test/quantization/weight_only/test_awq.py index 82a003791..86cbdc25a 100644 --- a/test/quantization/weight_only/test_awq.py +++ b/test/quantization/weight_only/test_awq.py @@ -6,13 +6,14 @@ import torch import transformers +from optimum.exporters.onnx import main_export + from onnx_neural_compressor import config from onnx_neural_compressor import data_reader from onnx_neural_compressor import logger from onnx_neural_compressor.quantization import algorithm_entry as algos from onnx_neural_compressor.quantization import matmul_4bits_quantizer from onnx_neural_compressor.quantization import matmul_nbits_quantizer -from optimum.exporters.onnx import main_export def find_onnx_file(folder_path): diff --git a/test/quantization/weight_only/test_gptq.py b/test/quantization/weight_only/test_gptq.py index cc5df2cf9..a1434b4e2 100644 --- a/test/quantization/weight_only/test_gptq.py +++ b/test/quantization/weight_only/test_gptq.py @@ -6,13 +6,14 @@ import torch import transformers +from optimum.exporters.onnx import main_export + from onnx_neural_compressor import config from onnx_neural_compressor import data_reader from onnx_neural_compressor import logger from onnx_neural_compressor.quantization import algorithm_entry as algos from onnx_neural_compressor.quantization import matmul_4bits_quantizer from onnx_neural_compressor.quantization import matmul_nbits_quantizer -from optimum.exporters.onnx import main_export def find_onnx_file(folder_path): diff --git a/test/quantization/weight_only/test_rtn.py 
b/test/quantization/weight_only/test_rtn.py index 7f75edc41..3ae65f8b9 100644 --- a/test/quantization/weight_only/test_rtn.py +++ b/test/quantization/weight_only/test_rtn.py @@ -4,12 +4,13 @@ import shutil import unittest +from optimum.exporters.onnx import main_export + from onnx_neural_compressor import config from onnx_neural_compressor import logger from onnx_neural_compressor.quantization import algorithm_entry as algos from onnx_neural_compressor.quantization import matmul_4bits_quantizer from onnx_neural_compressor.quantization import matmul_nbits_quantizer -from optimum.exporters.onnx import main_export def find_onnx_file(folder_path):
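
For reference, the quantization flow that the reformatted example main.py drives can be reduced to the minimal sketch below, showing only the data-free RTN path. This is an illustration, not part of the diff: the block_size/algo_config keyword arguments and the process() call are assumed to match onnxruntime's MatMulNBitsQuantizer API (which the config classes in matmul_nbits_quantizer.py subclass), and the model paths are placeholders.

    import onnx

    from onnx_neural_compressor.quantization import matmul_nbits_quantizer

    model_path = "model.onnx"  # placeholder: an exported ONNX LLM

    # RTN (round-to-nearest) is data-free, so no calibration data reader is needed.
    algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig()
    quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
        model_path,
        n_bits=4,                 # 4-bit weight-only quantization, as in main.py
        block_size=32,            # assumption: group size keyword, as in ORT's MatMulNBitsQuantizer
        algo_config=algo_config,  # assumption: same keyword as ORT's MatMulNBitsQuantizer
    )
    quant.process()               # assumption: same entry point as ORT's MatMulNBitsQuantizer
    # quant.model holds the quantized graph; save with external data as main.py does
    onnx.save_model(quant.model, "model_rtn.onnx", save_as_external_data=True)

The AWQ, GPTQ, and WOQ_TUNE branches in main.py follow the same shape: AWQ and GPTQ pass a calibration data reader (AWQDataloader / GPTQDataloader) into AWQWeightOnlyQuantConfig / GPTQWeightOnlyQuantConfig, while WOQ_TUNE hands config.get_woq_tuning_config() to tuning.autotune with tolerable_loss=0.005.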