From 6ec5767a48903419a0413fec34183dc4b1c9dc52 Mon Sep 17 00:00:00 2001
From: Hitesh Shah
Date: Fri, 13 Sep 2024 20:07:53 +0000
Subject: [PATCH] Quantize: CLI command to quantize the input model

Usage: olive quantize -m <model_name_or_path> --device <device> --algorithms <algorithms> --data_config_path <data_config_path> -o <output_path>

A few other code improvements:

* Moved the global functions in cli/base.py to be static members of BaseOliveCLICommand (olive/cli/base.py) to avoid multiple imports in each CLI command implementation. Moreover, these functions are only usable in the context of a CLI command implementation anyway.
* Created new helpers (_add_dataconfig_options, _add_hf_dataset_options, and _add_accelerator_options) on BaseOliveCLICommand to avoid code duplication and standardize argument handling across the different CLI command implementations.
---
 olive/cli/base.py               | 475 +++++++++++++++++++++++---------
 olive/cli/capture_onnx.py       |  30 +-
 olive/cli/export_adapters.py    |   2 +-
 olive/cli/finetune.py           |  88 ++----
 olive/cli/launcher.py           |   2 +
 olive/cli/manage_aml_compute.py |  18 +-
 olive/cli/perf_tuning.py        | 123 ++-------
 olive/cli/quantize.py           | 153 ++++++++++
 olive/common/hf/mappings.py     |  13 +
 olive/passes/pytorch/gptq.py    |  31 ++-
 test/unit_test/cli/test_cli.py  |  42 +++
 11 files changed, 632 insertions(+), 345 deletions(-)
 create mode 100644 olive/cli/quantize.py

diff --git a/olive/cli/base.py b/olive/cli/base.py
index e697fff3e..37e63f2c8 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -16,7 +16,7 @@
 import yaml
 
 from olive.cli.constants import CONDA_CONFIG
-from olive.common.utils import hash_dict
+from olive.common.utils import hash_dict, set_nested_dict_value, unescaped_str
 
 
 class BaseOliveCLICommand(ABC):
@@ -38,132 +38,353 @@ def register_subcommand(parser: ArgumentParser):
     def run(self):
         raise NotImplementedError
 
+    @staticmethod
+    def _get_model_name_or_path(model_name_or_path) -> Union[str, Dict[str, str]]:
+        pattern = r"^(?P<registry_name>[^:]+):(?P<model_name>[^:]+):(?P<version>[^:]+)$"
+        match = re.match(pattern, model_name_or_path)
+
+        if match:
+            return {
+                "type": "azureml_registry_model",
+                "registry_name": match.group("registry_name"),
+                "name": match.group("model_name"),
+                "version": match.group("version"),
+            }
+
+        pattern = r"https://huggingface\.co/([^/]+/[^/]+)(?:/.*)?"
+        match = re.search(pattern, model_name_or_path)
+
+        if match:
+            return match.group(1)
+
+        return model_name_or_path
 
-def get_model_name_or_path(model_name_or_path) -> Union[str, Dict[str, str]]:
-    pattern = r"^(?P<registry_name>[^:]+):(?P<model_name>[^:]+):(?P<version>[^:]+)$"
-    match = re.match(pattern, model_name_or_path)
-
-    if match:
-        return {
-            "type": "azureml_registry_model",
-            "registry_name": match.group("registry_name"),
-            "name": match.group("model_name"),
-            "version": match.group("version"),
-        }
-
-    pattern = r"https://huggingface\.co/([^/]+/[^/]+)(?:/.*)?"
-    match = re.search(pattern, model_name_or_path)
-
-    if match:
-        return match.group(1)
-
-    return model_name_or_path
-
-
-def add_logging_options(sub_parser):
-    log_group = sub_parser.add_argument_group("logging options")
-    log_group.add_argument(
-        "--log_level",
-        type=int,
-        default=3,
-        help="Logging level. Default is 3. 
level 0: DEBUG, 1: INFO, 2: WARNING, 3: ERROR, 4: CRITICAL", - ) - - -def add_remote_options(sub_parser): - remote_group = sub_parser.add_argument_group("remote options") - remote_group.add_argument( - "--resource_group", - type=str, - required=False, - help="Resource group for the AzureML workspace.", - ) - remote_group.add_argument( - "--workspace_name", - type=str, - required=False, - help="Workspace name for the AzureML workspace.", - ) - remote_group.add_argument( - "--keyvault_name", - type=str, - required=False, - help=( - "The azureml keyvault name with huggingface token to use for remote run. Refer to" - " https://microsoft.github.io/Olive/features/huggingface_model_optimization.html#huggingface-login for" - " more details." - ), - ) - remote_group.add_argument( - "--aml_compute", - type=str, - required=False, - help="The compute name to run the workflow on.", - ) - - -def add_hf_model_options(sub_parser): - model_group = sub_parser.add_argument_group("model options") - model_group.add_argument( - "-m", - "--model_name_or_path", - type=str, - required=True, - help=( - "The model checkpoint for weights initialization. If using an AzureML Registry model, provide the model" - " path as 'registry_name:model_name:version'." - ), - ) - model_group.add_argument("--trust_remote_code", action="store_true", help="Trust remote code when loading a model.") - model_group.add_argument("-t", "--task", type=str, help="Task for which the model is used.") - - -def is_remote_run(args): - return all([args.resource_group, args.workspace_name, args.aml_compute]) - - -def update_remote_option(config, args, cli_action, tempdir): - if args.resource_group or args.workspace_name or args.aml_compute: - if not is_remote_run(args): - raise ValueError("resource_group, workspace_name and aml_compute are required for remote workflow run.") - - config["workflow_id"] = f"{cli_action}-{hash_dict(config)}" - - try: - subscription_id = json.loads(subprocess.check_output("az account show", shell=True).decode("utf-8"))["id"] - print("Using Azure subscription ID: %s", subscription_id) - - except subprocess.CalledProcessError: - print( - "Error: Unable to retrieve account information. " - "Make sure you are logged in to Azure CLI with command `az login`." + @staticmethod + def _add_logging_options(sub_parser): + log_group = sub_parser.add_argument_group("logging options") + log_group.add_argument( + "--log_level", + type=int, + default=3, + help="Logging level. Default is 3. level 0: DEBUG, 1: INFO, 2: WARNING, 3: ERROR, 4: CRITICAL", + ) + return log_group + + @staticmethod + def _add_remote_options(sub_parser): + remote_group = sub_parser.add_argument_group("remote options") + remote_group.add_argument( + "--resource_group", + type=str, + required=False, + help="Resource group for the AzureML workspace.", + ) + remote_group.add_argument( + "--workspace_name", + type=str, + required=False, + help="Workspace name for the AzureML workspace.", + ) + remote_group.add_argument( + "--keyvault_name", + type=str, + required=False, + help=( + "The azureml keyvault name with huggingface token to use for remote run. Refer to" + " https://microsoft.github.io/Olive/features/huggingface_model_optimization.html#huggingface-login for" + " more details." 
+ ), + ) + remote_group.add_argument( + "--aml_compute", + type=str, + required=False, + help="The compute name to run the workflow on.", + ) + + return remote_group + + @staticmethod + def _add_hf_model_options(sub_parser): + model_group = sub_parser.add_argument_group("model options") + model_group.add_argument( + "-m", + "--model_name_or_path", + type=str, + required=True, + help=( + "The model checkpoint for weights initialization. If using an AzureML Registry model, provide the model" + " path as 'registry_name:model_name:version'." + ), + ) + model_group.add_argument( + "--trust_remote_code", action="store_true", help="Trust remote code when loading a model." + ) + model_group.add_argument("-t", "--task", type=str, help="Task for which the model is used.") + + return model_group + + @staticmethod + def _add_dataconfig_options(sub_parser): + dataconfig_group = sub_parser.add_argument_group( + "data config options, which mutually exclusive with huggingface dataset options" + ) + dataconfig_group.add_argument( + "--data_config_path", + type=str, + help="Path to the data config file. It allows to customize the data config(json/yaml) for the model.", + ) + + return dataconfig_group + + @staticmethod + def _add_dataset_options(sub_parser): + dataset_group = sub_parser.add_argument_group("dataset options") + dataset_group.add_argument( + "-d", + "--data_name", + type=str, + required=True, + help="The dataset name.", + ) + dataset_group.add_argument("--train_subset", type=str, help="The subset to use for training.") + dataset_group.add_argument("--eval_subset", type=str, help="The subset to use for evaluation.") + # TODO(jambayk): currently only supports single file or list of files, support mapping + dataset_group.add_argument( + "--data_files", type=str, help="The dataset files. If multiple files, separate by comma." + ) + dataset_group.add_argument("--train_split", type=str, default="train", help="The split to use for training.") + dataset_group.add_argument( + "--eval_split", + default="", + help="The dataset split to evaluate on.", + ) + text_group = dataset_group.add_mutually_exclusive_group(required=False) + text_group.add_argument( + "--text_field", + type=str, + help="The text field to use for fine-tuning.", + ) + text_group.add_argument( + "--text_template", + # using special string type to allow for escaped characters like \n + type=unescaped_str, + help=r"Template to generate text field from. E.g. 
'### Question: {prompt} \n### Answer: {response}'", + ) + dataset_group.add_argument( + "--max_seq_len", + type=int, + default=1024, + help="Maximum sequence length for the data.", + ) + dataset_group.add_argument( + "--add_special_tokens", + type=bool, + default=False, + help="Whether to add special tokens during preprocessing.", + ) + dataset_group.add_argument( + "--max_samples", + type=int, + default=256, + help="Maximum samples to select from the dataset.", + ) + dataset_group.add_argument( + "--batch_size", + type=int, + default=1, + help="Batch size.", + ) + + return dataset_group, text_group + + def _update_dataset_options(self, config): + load_key = ("data_configs", 0, "load_dataset_config") + preprocess_key = ("data_configs", 0, "pre_process_data_config") + dataloader_key = ("data_configs", 0, "dataloader_config") + to_replace = [ + ((*load_key, "data_name"), self.args.data_name), + ((*load_key, "split"), self.args.train_split), + ((*load_key, "subset"), self.args.train_subset), + ( + (*load_key, "data_files"), + self.args.data_files.split(",") if self.args.data_files else None, + ), + ((*preprocess_key, "text_cols"), self.args.text_field), + ((*preprocess_key, "text_template"), self.args.text_template), + ((*preprocess_key, "max_seq_len"), self.args.max_seq_len), + ((*preprocess_key, "add_special_tokens"), self.args.add_special_tokens), + ((*preprocess_key, "max_samples"), self.args.max_samples), + ((*dataloader_key, "batch_size"), self.args.batch_size), + ] + for keys, value in to_replace: + if value is not None: + set_nested_dict_value(config, keys, value) + + @staticmethod + def _add_hf_dataset_options(sub_parser): + hf_dataset_group = sub_parser.add_argument_group( + "huggingface dataset options, if dataset options are not provided, " + "user should provide the following options to modify the default data config. " + "Please refer to olive.data.container.TransformersTokenDummyDataContainer for more details." 
+ ) + hf_dataset_group.add_argument( + "--hf_model_name", + help="Huggingface model name used to load model configs from huggingface.", + ) + hf_dataset_group.add_argument( + "--batch_size", + type=int, + help="Batch size of the input data.", + ) + hf_dataset_group.add_argument( + "--seq_len", + type=int, + help="Sequence length to use for the input data.", + ) + hf_dataset_group.add_argument( + "--past_seq_len", + type=int, + help="Past sequence length to use for the input data.", + ) + hf_dataset_group.add_argument( + "--max_seq_len", + type=int, + help="Max sequence length to use for the input data.", + ) + hf_dataset_group.add_argument( + "--shared_kv", + action="store_true", + help="Whether to enable share kv cache in the input data.", + ) + hf_dataset_group.add_argument( + "--generative", + action="store_true", + help="Whether to enable generative mode in the input data.", + ) + hf_dataset_group.add_argument( + "--ort_past_key_name", + type=str, + help="Past key name for the input data.", + ) + hf_dataset_group.add_argument( + "--ort_past_value_name", + type=str, + help="Past value name for the input data.", + ) + hf_dataset_group.add_argument( + "--trust_remote_code", + action="store_true", + help="Whether to trust remote code in the input data.", + ) + hf_dataset_group.add_argument( + "--max_samples", + type=int, + help="Max samples to use for the input data.", + ) + hf_dataset_group.add_argument( + "--fields_no_batch", + nargs="*", + help="List of fields that should not be batched.", + ) + + return hf_dataset_group + + @staticmethod + def _add_accelerator_options(sub_parser): + accelerator_group = sub_parser.add_argument_group("accelerator group") + + accelerator_group.add_argument( + "--device", + type=str, + default="cpu", + choices=["gpu", "cpu", "npu"], + help="Device to use for the model.", + ) + + accelerator_group.add_argument( + "--providers_list", + type=str, + nargs="*", + choices=[ + "CUDAExecutionProvider", + "DmlExecutionProvider", + "JsExecutionProvider", + "MIGraphXExecutionProvider", + "OpenVINOExecutionProvider", + "OpenVINOExecutionProvider", + "QNNExecutionProviderROCMExecutionProvider", + "TensorrtExecutionProvider", + ], + help=( + "List of execution providers to use for ONNX model. They are case sensitive. " + "If not provided, all available providers will be used." 
+ ), + ) + + return accelerator_group + + def _update_accelerator_options(self, config): + to_replace = [ + (("systems", "local_system", "accelerators", 0, "device"), self.args.device), + ] + + if self.args.providers_list: + to_replace.append( + (("systems", "local_system", "accelerators", 0, "execution_providers"), self.args.providers_list) ) - config["azureml_client"] = { - "subscription_id": subscription_id, - "resource_group": args.resource_group, - "workspace_name": args.workspace_name, - "keyvault_name": args.keyvault_name, - "default_auth_params": {"exclude_managed_identity_credential": True}, - } - - conda_file_path = Path(tempdir) / "conda_gpu.yaml" - with open(conda_file_path, "w") as f: - yaml.dump(CONDA_CONFIG, f) - - config["systems"]["aml_system"] = { - "type": "AzureML", - "accelerators": [{"device": "GPU", "execution_providers": ["CUDAExecutionProvider"]}], - "aml_compute": args.aml_compute, - "aml_docker_config": { - "base_image": "mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04", - "conda_file_path": str(conda_file_path), - }, - "hf_token": bool(args.keyvault_name), - } - config["workflow_host"] = "aml_system" - - -# TODO(team): Remove this function once the output structure is refactored -def get_output_model_number(outputs: Dict) -> int: - return sum(len(f.nodes) for f in outputs.values()) + for k, v in to_replace: + if v is not None: + set_nested_dict_value(config, k, v) + + def _is_remote_run(self): + return all([self.args.resource_group, self.args.workspace_name, self.args.aml_compute]) + + def _update_remote_option(self, config, cli_action, tempdir): + if self.args.resource_group or self.args.workspace_name or self.args.aml_compute: + if not self._is_remote_run(): + raise ValueError("resource_group, workspace_name and aml_compute are required for remote workflow run.") + + config["workflow_id"] = f"{cli_action}-{hash_dict(config)}" + + try: + subscription_id = json.loads(subprocess.check_output("az account show", shell=True).decode("utf-8"))[ + "id" + ] + print(f"Using Azure subscription ID: {subscription_id}") + + except subprocess.CalledProcessError: + print( + "Error: Unable to retrieve account information. " + "Make sure you are logged in to Azure CLI with command `az login`." 
+ ) + + config["azureml_client"] = { + "subscription_id": subscription_id, + "resource_group": self.args.resource_group, + "workspace_name": self.args.workspace_name, + "keyvault_name": self.args.keyvault_name, + "default_auth_params": {"exclude_managed_identity_credential": True}, + } + + conda_file_path = Path(tempdir) / "conda_gpu.yaml" + with open(conda_file_path, "w") as f: + yaml.dump(CONDA_CONFIG, f) + + config["systems"]["aml_system"] = { + "type": "AzureML", + "accelerators": [{"device": "GPU", "execution_providers": ["CUDAExecutionProvider"]}], + "aml_compute": self.args.aml_compute, + "aml_docker_config": { + "base_image": "mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04", + "conda_file_path": str(conda_file_path), + }, + "hf_token": bool(self.args.keyvault_name), + } + config["workflow_host"] = "aml_system" + + # TODO(team): Remove this function once the output structure is refactored + @staticmethod + def _get_output_model_number(outputs: Dict) -> int: + return sum(len(f.nodes) for f in outputs.values()) diff --git a/olive/cli/capture_onnx.py b/olive/cli/capture_onnx.py index 1454abb5d..e6b209c59 100644 --- a/olive/cli/capture_onnx.py +++ b/olive/cli/capture_onnx.py @@ -11,16 +11,7 @@ from pathlib import Path from typing import ClassVar, Dict -from olive.cli.base import ( - BaseOliveCLICommand, - add_hf_model_options, - add_logging_options, - add_remote_options, - get_model_name_or_path, - get_output_model_number, - is_remote_run, - update_remote_option, -) +from olive.cli.base import BaseOliveCLICommand from olive.common.utils import IntEnumBase, hardlink_copy_dir, set_nested_dict_value, set_tempdir @@ -41,10 +32,10 @@ def register_subcommand(parser: ArgumentParser): help=("Capture ONNX graph using PyTorch Exporter or Model Builder from the Huggingface model."), ) - add_logging_options(sub_parser) + CaptureOnnxGraphCommand._add_logging_options(sub_parser) # model options - add_hf_model_options(sub_parser) + CaptureOnnxGraphCommand._add_hf_model_options(sub_parser) sub_parser.add_argument( "--device", @@ -153,7 +144,7 @@ def register_subcommand(parser: ArgumentParser): ) # remote options - add_remote_options(sub_parser) + CaptureOnnxGraphCommand._add_remote_options(sub_parser) sub_parser.set_defaults(func=CaptureOnnxGraphCommand) @@ -167,18 +158,18 @@ def run(self): output = olive_run(run_config) - if is_remote_run(self.args): + if self._is_remote_run(): # TODO(jambayk): point user to datastore with outputs or download outputs # both are not implemented yet return - if get_output_model_number(output) > 0: + if CaptureOnnxGraphCommand._get_output_model_number(output) > 0: output_path = Path(self.args.output_path) output_path.mkdir(parents=True, exist_ok=True) pass_name = "m" if self.args.use_model_builder else "c" device_name = "gpu-cuda_model" if self.args.device == "gpu" else "cpu-cpu_model" hardlink_copy_dir(Path(tempdir) / pass_name / device_name, output_path) - print("ONNX Model is saved to %s", output_path.resolve()) + print(f"ONNX Model is saved to {output_path.resolve()}") else: print("Failed to run capture-onnx-graph. 
Please set the log_level to 1 for more detailed logs.") @@ -191,7 +182,10 @@ def get_run_config(self, tempdir: str) -> Dict: to_replace = [ ("output_dir", tempdir), ("log_severity_level", self.args.log_level), - (("input_model", "model_path"), get_model_name_or_path(self.args.model_name_or_path)), + ( + ("input_model", "model_path"), + CaptureOnnxGraphCommand._get_model_name_or_path(self.args.model_name_or_path), + ), (("input_model", "load_kwargs", "trust_remote_code"), self.args.trust_remote_code), (("systems", "local_system", "accelerators", 0, "device"), self.args.device), ( @@ -231,7 +225,7 @@ def get_run_config(self, tempdir: str) -> Dict: if value is None: continue set_nested_dict_value(config, keys, value) - update_remote_option(config, self.args, "capture-onnx-graph", tempdir) + self._update_remote_option(config, "capture-onnx-graph", tempdir) return config diff --git a/olive/cli/export_adapters.py b/olive/cli/export_adapters.py index dd7241f7e..2673c4ea4 100644 --- a/olive/cli/export_adapters.py +++ b/olive/cli/export_adapters.py @@ -136,7 +136,7 @@ def run(self): ) output_path = save_weights(transformed_weights, self.args.output_path, self.args.save_format) - print("Exported adapter weights to %s", output_path) + print(f"Exported adapter weights to {output_path}") return output_path diff --git a/olive/cli/finetune.py b/olive/cli/finetune.py index a9506387e..e88968199 100644 --- a/olive/cli/finetune.py +++ b/olive/cli/finetune.py @@ -11,17 +11,8 @@ from pathlib import Path from typing import ClassVar, Dict -from olive.cli.base import ( - BaseOliveCLICommand, - add_hf_model_options, - add_logging_options, - add_remote_options, - get_model_name_or_path, - get_output_model_number, - is_remote_run, - update_remote_option, -) -from olive.common.utils import hardlink_copy_dir, set_nested_dict_value, set_tempdir, unescaped_str +from olive.cli.base import BaseOliveCLICommand +from olive.common.utils import hardlink_copy_dir, set_nested_dict_value, set_tempdir class FineTuneCommand(BaseOliveCLICommand): @@ -37,7 +28,7 @@ def register_subcommand(parser: ArgumentParser): ), ) - add_logging_options(sub_parser) + FineTuneCommand._add_logging_options(sub_parser) # TODO(jambayk): option to list/install required dependencies? sub_parser.add_argument( @@ -49,7 +40,7 @@ def register_subcommand(parser: ArgumentParser): ) # Model options - add_hf_model_options(sub_parser) + FineTuneCommand._add_hf_model_options(sub_parser) sub_parser.add_argument( "--torch_dtype", @@ -63,42 +54,8 @@ def register_subcommand(parser: ArgumentParser): ) # Dataset options - dataset_group = sub_parser.add_argument_group("dataset options") - dataset_group.add_argument( - "-d", - "--data_name", - type=str, - required=True, - help="The dataset name.", - ) - # TODO(jambayk): currently only supports single file or list of files, support mapping - dataset_group.add_argument( - "--data_files", type=str, help="The dataset files. If multiple files, separate by comma." 
- ) - dataset_group.add_argument("--train_split", type=str, default="train", help="The split to use for training.") - dataset_group.add_argument( - "--eval_split", - default="", - help="The dataset split to evaluate on.", - ) - text_group = dataset_group.add_mutually_exclusive_group(required=True) - text_group.add_argument( - "--text_field", - type=str, - help="The text field to use for fine-tuning.", - ) - text_group.add_argument( - "--text_template", - # using special string type to allow for escaped characters like \n - type=unescaped_str, - help=r"Template to generate text field from. E.g. '### Question: {prompt} \n### Answer: {response}'", - ) - dataset_group.add_argument( - "--max_seq_len", - type=int, - default=1024, - help="Maximum sequence length for the data.", - ) + FineTuneCommand._add_dataset_options(sub_parser) + # LoRA options lora_group = sub_parser.add_argument_group("lora options") lora_group.add_argument( @@ -134,7 +91,7 @@ def register_subcommand(parser: ArgumentParser): sub_parser.add_argument("--clean", action="store_true", help="Run in a clean cache directory") # remote options - add_remote_options(sub_parser) + FineTuneCommand._add_remote_options(sub_parser) sub_parser.set_defaults(func=FineTuneCommand) @@ -148,17 +105,17 @@ def run(self): output = olive_run(run_config) - if is_remote_run(self.args): + if self._is_remote_run(): # TODO(jambayk): point user to datastore with outputs or download outputs # both are not implemented yet return - if get_output_model_number(output) > 0: + if FineTuneCommand._get_output_model_number(output) > 0: # need to improve the output structure of olive run output_path = Path(self.args.output_path) output_path.mkdir(parents=True, exist_ok=True) hardlink_copy_dir(Path(tempdir) / "-".join(run_config["passes"].keys()) / "gpu-cuda_model", output_path) - print("Model and adapters saved to %s", output_path.resolve()) + print(f"Model and adapters saved to {output_path.resolve()}") else: print("Failed to run finetune. 
Please set the log_level to 1 for more detailed logs.") @@ -178,21 +135,11 @@ def parse_training_args(self) -> Dict: return {k: v for k, v in vars(training_args).items() if k in arg_keys} def get_run_config(self, tempdir: str) -> Dict: - load_key = ("data_configs", 0, "load_dataset_config") - preprocess_key = ("data_configs", 0, "pre_process_data_config") + finetune_key = ("passes", "f") - model_path = get_model_name_or_path(self.args.model_name_or_path) + model_path = FineTuneCommand._get_model_name_or_path(self.args.model_name_or_path) to_replace = [ (("input_model", "model_path"), model_path), - ((*load_key, "data_name"), self.args.data_name), - ((*load_key, "split"), self.args.train_split), - ( - (*load_key, "data_files"), - self.args.data_files.split(",") if self.args.data_files else None, - ), - ((*preprocess_key, "text_cols"), self.args.text_field), - ((*preprocess_key, "text_template"), self.args.text_template), - ((*preprocess_key, "max_seq_len"), self.args.max_seq_len), ((*finetune_key, "type"), self.args.method), ((*finetune_key, "torch_dtype"), self.args.torch_dtype), ((*finetune_key, "training_args"), self.parse_training_args()), @@ -210,10 +157,11 @@ def get_run_config(self, tempdir: str) -> Dict: to_replace.append(((*finetune_key, "target_modules"), self.args.target_modules.split(","))) config = deepcopy(TEMPLATE) + self._update_dataset_options(config) + for keys, value in to_replace: - if value is None: - continue - set_nested_dict_value(config, keys, value) + if value is not None: + set_nested_dict_value(config, keys, value) if self.args.eval_split: eval_data_config = deepcopy(config["data_configs"][0]) @@ -225,7 +173,7 @@ def get_run_config(self, tempdir: str) -> Dict: if not self.args.use_ort_genai: del config["passes"]["m"] - update_remote_option(config, self.args, "finetune", tempdir) + self._update_remote_option(config, "finetune", tempdir) config["log_severity_level"] = self.args.log_level return config @@ -247,6 +195,8 @@ def get_run_config(self, tempdir: str) -> Dict: "type": "HuggingfaceContainer", "load_dataset_config": {}, "pre_process_data_config": {}, + "dataloader_config": {}, + "post_process_data_config": {}, } ], "passes": { diff --git a/olive/cli/launcher.py b/olive/cli/launcher.py index e758f9dfc..f34c14ad3 100644 --- a/olive/cli/launcher.py +++ b/olive/cli/launcher.py @@ -13,6 +13,7 @@ from olive.cli.finetune import FineTuneCommand from olive.cli.manage_aml_compute import ManageAMLComputeCommand from olive.cli.perf_tuning import PerfTuningCommand +from olive.cli.quantize import QuantizeCommand from olive.cli.run import WorkflowRunCommand @@ -33,6 +34,7 @@ def get_cli_parser(called_as_console_script: bool = True) -> ArgumentParser: ConfigureQualcommSDKCommand.register_subcommand(commands_parser) ManageAMLComputeCommand.register_subcommand(commands_parser) PerfTuningCommand.register_subcommand(commands_parser) + QuantizeCommand.register_subcommand(commands_parser) CloudCacheCommand.register_subcommand(commands_parser) return parser diff --git a/olive/cli/manage_aml_compute.py b/olive/cli/manage_aml_compute.py index f054b07ce..832bc8947 100644 --- a/olive/cli/manage_aml_compute.py +++ b/olive/cli/manage_aml_compute.py @@ -68,7 +68,7 @@ def run(self): ) if self.args.create: - print("Creating compute %s...", self.args.compute_name) + print(f"Creating compute {self.args.compute_name}...") if self.args.vm_size is None: raise ValueError("vm_size must be provided if operation is create") if self.args.location is None: @@ -84,19 +84,15 @@ def run(self): ) 
ml_client.begin_create_or_update(cluster_basic).result() print( - "Successfully created compute: %s at %s with vm_size:%s and " - "min_nodes=%d and max_nodes=%d and idle_time_before_scale_down=%d", - self.args.compute_name, - self.args.location, - self.args.vm_size, - self.args.min_nodes, - self.args.max_nodes, - self.args.idle_time_before_scale_down, + f"Successfully created compute: {self.args.compute_name} at {self.args.location} " + f"with vm_size:{self.args.vm_size} and min_nodes={self.args.min_nodes} and " + f"max_nodes={self.args.max_nodes} and " + f"idle_time_before_scale_down={self.args.idle_time_before_scale_down}" ) elif self.args.delete: - print("Deleting compute %s...", self.args.compute_name) + print(f"Deleting compute {self.args.compute_name}...") ml_client.compute.begin_delete(self.args.compute_name).wait() - print("Successfully deleted compute: %s", self.args.compute_name) + print(f"Successfully deleted compute: {self.args.compute_name}") @classmethod def get_ml_client(cls, aml_config_path: str, subscription_id: str, resource_group: str, workspace_name: str): diff --git a/olive/cli/perf_tuning.py b/olive/cli/perf_tuning.py index aa499dba5..7516a3e8b 100644 --- a/olive/cli/perf_tuning.py +++ b/olive/cli/perf_tuning.py @@ -15,14 +15,7 @@ import yaml from olive.auto_optimizer.template_mapping import PERF_TUNING_TEMPLATE -from olive.cli.base import ( - BaseOliveCLICommand, - add_logging_options, - add_remote_options, - get_output_model_number, - is_remote_run, - update_remote_option, -) +from olive.cli.base import BaseOliveCLICommand from olive.common.utils import set_nested_dict_value, set_tempdir from olive.data.config import DataConfig from olive.workflows import run as olive_run @@ -41,7 +34,7 @@ def register_subcommand(parser: ArgumentParser): "--hf_model_name hf_model_name --device device_type to get the tuned session parameters." ), ) - add_logging_options(sub_parser) + PerfTuningCommand._add_logging_options(sub_parser) # model options model_group = sub_parser.add_argument_group("model options") @@ -52,95 +45,20 @@ def register_subcommand(parser: ArgumentParser): ) # dataset options - dataset_group = sub_parser.add_argument_group( - "dataset options, which mutually exclusive with huggingface dataset options" - ) - dataset_group.add_argument( - "--data_config_path", - type=str, - help="Path to the data config file. It allows to customize the data config(json/yaml) for the model.", - ) - - hf_dataset_group = sub_parser.add_argument_group( - "huggingface dataset options, if dataset options are not provided, " - "user should provide the following options to modify the default data config. " - "Please refer to olive.data.container.TransformersTokenDummyDataContainer for more details." 
- ) + PerfTuningCommand._add_dataconfig_options(sub_parser) + hf_dataset_group = PerfTuningCommand._add_hf_dataset_options(sub_parser) hf_dataset_group.add_argument( "--predict_with_kv_cache", action="store_true", help="Whether to use key-value cache for perf_tuning", ) - hf_dataset_group.add_argument( - "--hf_model_name", - help="Huggingface model name used to load model configs from huggingface.", - ) - hf_dataset_group.add_argument( - "--batch_size", - type=int, - help="Batch size of the input data.", - ) - hf_dataset_group.add_argument( - "--seq_len", - type=int, - help="Sequence length to use for the input data.", - ) - hf_dataset_group.add_argument( - "--past_seq_len", - type=int, - help="Past sequence length to use for the input data.", - ) - hf_dataset_group.add_argument( - "--max_seq_len", - type=int, - help="Max sequence length to use for the input data.", - ) - hf_dataset_group.add_argument( - "--shared_kv", - action="store_true", - help="Whether to enable share kv cache in the input data.", - ) - hf_dataset_group.add_argument( - "--generative", - action="store_true", - help="Whether to enable generative mode in the input data.", - ) - hf_dataset_group.add_argument( - "--ort_past_key_name", - type=str, - help="Past key name for the input data.", - ) - hf_dataset_group.add_argument( - "--ort_past_value_name", - type=str, - help="Past value name for the input data.", - ) - hf_dataset_group.add_argument( - "--trust_remote_code", - action="store_true", - help="Whether to trust remote code in the input data.", - ) - hf_dataset_group.add_argument( - "--max_samples", - type=int, - help="Max samples to use for the input data.", - ) - hf_dataset_group.add_argument( - "--fields_no_batch", - nargs="*", - help="List of fields that should not be batched.", - ) # pass options pass_group = sub_parser.add_argument_group("pass options") - pass_group.add_argument( - "--device", - type=str, - default="cpu", - choices=["gpu", "cpu"], - help="Device to use for the model.", - ) + # accelerator options + PerfTuningCommand._add_accelerator_options(sub_parser) + pass_group.add_argument( "--cpu_cores", type=int, @@ -157,15 +75,6 @@ def register_subcommand(parser: ArgumentParser): action="store_true", help="Whether enable CUDA Graph for CUDA execution provider.", ) - pass_group.add_argument( - "--providers_list", - type=str, - nargs="*", - help=( - "List of execution providers to use for ONNX model. They are case sensitive. " - "If not provided, all available providers will be used." - ), - ) pass_group.add_argument( "--execution_mode_list", type=int, nargs="*", help="Parallelism list between operators." 
) @@ -211,7 +120,7 @@ def register_subcommand(parser: ArgumentParser): ) # remote options - add_remote_options(sub_parser) + PerfTuningCommand._add_remote_options(sub_parser) sub_parser.set_defaults(func=PerfTuningCommand) @@ -302,12 +211,12 @@ def get_run_config(self, tempdir) -> Dict: to_replace.append((system_ep_key, self.args.providers_list)) config = deepcopy(template_config) - for k, v in to_replace: - if v is None: - continue - set_nested_dict_value(config, k, v) + self._update_accelerator_options(config) + self._update_remote_option(config, "perf-tuning", tempdir) - update_remote_option(config, self.args, "perf-tuning", tempdir) + for k, v in to_replace: + if v is not None: + set_nested_dict_value(config, k, v) config["log_severity_level"] = self.args.log_level return config @@ -320,12 +229,12 @@ def run(self): run_config["output_dir"] = tempdir output = olive_run(run_config) - if is_remote_run(self.args): + if self._is_remote_run(): # TODO(jambayk): point user to datastore with outputs or download outputs # both are not implemented yet return - if get_output_model_number(output) > 0: + if PerfTuningCommand._get_output_model_number(output) > 0: # need to improve the output structure of olive run output_path = Path(self.args.output_path) output_path.mkdir(parents=True, exist_ok=True) @@ -336,6 +245,6 @@ def run(self): with rls_json_path.open() as f: infer_settings = json.load(f)["config"]["inference_settings"] json.dump(infer_settings, infer_setting_output_path.open("w"), indent=4) - print("Inference session parameters are saved to %s", output_path.resolve()) + print(f"Inference session parameters are saved to {output_path.resolve()}") else: print("Failed to run tune-session-params. Please set the log_level to 1 for more detailed logs.") diff --git a/olive/cli/quantize.py b/olive/cli/quantize.py new file mode 100644 index 000000000..28b9ebdf8 --- /dev/null +++ b/olive/cli/quantize.py @@ -0,0 +1,153 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +# ruff: noqa: T201 +# ruff: noqa: RUF012 + +import tempfile +from argparse import ArgumentParser +from copy import deepcopy +from pathlib import Path +from typing import Any, Dict + +from olive.cli.base import BaseOliveCLICommand +from olive.common.utils import hardlink_copy_dir, set_nested_dict_value, set_tempdir + + +class QuantizeCommand(BaseOliveCLICommand): + _CONFIG_TEMPLATE = { + "input_model": {"type": "HfModel", "load_kwargs": {"attn_implementation": "eager"}}, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [{"device": "gpu", "execution_providers": ["CUDAExecutionProvider"]}], + } + }, + "data_configs": [ + { + "name": "default_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": {}, + "pre_process_data_config": {}, + "dataloader_config": {}, + "post_process_data_config": {}, + } + ], + "passes": { + "awq": {"type": "AutoAWQQuantizer"}, + "gptq": { + # Ref: https://github.com/AutoGPTQ/AutoGPTQ/pull/651/files + "type": "GptqQuantizer", + "data_config": "default_data_config", + }, + "quarot": { + "type": "QuaRot", + "w_rtn": True, + "rotate": True, + "w_bits": 4, + "a_bits": 4, + "k_bits": 4, + "v_bits": 4, + "calibration_data_config": None, + }, + }, + "pass_flows": [], + "cache_dir": "cache", + "output_dir": "models", + "host": "local_system", + "target": "local_system", + } + + @staticmethod + def register_subcommand(parser: ArgumentParser): + sub_parser = parser.add_parser( + "quantize", + help="Quantize the input model", + ) + + # Logging options + QuantizeCommand._add_logging_options(sub_parser) + + sub_parser.add_argument( + "-o", + "--output_path", + type=str, + required=True, + help="Path to save quantized model weights.", + ) + sub_parser.add_argument( + "--tempdir", default=None, type=str, help="Root directory for tempfile directories and files" + ) + sub_parser.add_argument( + "--algorithms", + type=str, + nargs="*", + required=True, + choices=sorted(QuantizeCommand._CONFIG_TEMPLATE["passes"].keys()), + help="List of quantization algorithms to run.", + ) + + # model options + QuantizeCommand._add_hf_model_options(sub_parser) + + # dataset options + QuantizeCommand._add_dataset_options(sub_parser) + + # accelerator options + QuantizeCommand._add_accelerator_options(sub_parser) + + # remote options + QuantizeCommand._add_remote_options(sub_parser) + + sub_parser.set_defaults(func=QuantizeCommand) + + def _get_run_config(self, tempdir: str) -> Dict[str, Any]: + to_replace = [ + (("input_model", "model_path"), self.args.model_name_or_path), + (("input_model", "load_kwargs", "trust_remote_code"), self.args.trust_remote_code), + (("pass_flows"), [[name] for name in self.args.algorithms]), + (("output_dir"), tempdir), + ] + + if self.args.task: + to_replace.append((("input_model", "task"), self.args.task)) + + config = deepcopy(QuantizeCommand._CONFIG_TEMPLATE) + self._update_dataset_options(config) + self._update_accelerator_options(config) + config["log_severity_level"] = self.args.log_level + + for k, v in to_replace: + if v is not None: + set_nested_dict_value(config, k, v) + + return config + + def run(self): + from olive.workflows import run as olive_run + + set_tempdir(self.args.tempdir) + + with tempfile.TemporaryDirectory() as tempdir: + run_config = self._get_run_config(tempdir) + output = olive_run(run_config) + + if self._is_remote_run(): + # TODO(jambayk): point user to datastore with outputs or download outputs + # both are not 
implemented yet + return + + if QuantizeCommand._get_output_model_number(output) > 0: + # need to improve the output structure of olive run + output_path = Path(self.args.output_path) + output_path.mkdir(parents=True, exist_ok=True) + device_name = "gpu-cuda_model" if self.args.device == "gpu" else "cpu-cpu_model" + for algorithm_name in self.args.algorithms: + hardlink_copy_dir( + Path(tempdir) / algorithm_name / device_name / "model", output_path / algorithm_name + ) + print(f"Quantized models saved to {output_path.resolve()}") + else: + print("Failed to run quantize. Please set the log_level to 1 for more detailed logs.") diff --git a/olive/common/hf/mappings.py b/olive/common/hf/mappings.py index 723928a37..e13bf2d86 100644 --- a/olive/common/hf/mappings.py +++ b/olive/common/hf/mappings.py @@ -70,3 +70,16 @@ "llama": "gpt2", "roberta": "bert", } + +MODEL_OUTSIDE_LAYER_MODULES = { + "phi3": ["model.embed_tokens", "embed_dropout", "model.norm"], +} + +MODEL_INSIDE_LAYER_MODULES = { + "phi3": [ + ["self_attn.qkv_proj"], + ["self_attn.o_proj"], + ["mlp.gate_up_proj"], + ["mlp.down_proj"], + ] +} diff --git a/olive/passes/pytorch/gptq.py b/olive/passes/pytorch/gptq.py index b39bae832..02e3936c3 100644 --- a/olive/passes/pytorch/gptq.py +++ b/olive/passes/pytorch/gptq.py @@ -11,6 +11,7 @@ import torch from olive.common.config_utils import validate_config +from olive.common.hf.mappings import MODEL_INSIDE_LAYER_MODULES, MODEL_OUTSIDE_LAYER_MODULES from olive.data.config import DataConfig from olive.hardware.accelerator import AcceleratorSpec, Device from olive.model import HfModelHandler, PyTorchModelHandler @@ -166,20 +167,26 @@ def _run_for_config( def get_onnx_quant_linear(*args, **kwargs): return QuantLinear - if hasattr(pytorch_model, "config") and pytorch_model.config.model_type in GPTQ_CAUSAL_LM_MODEL_MAP: - model_type = pytorch_model.config.model_type - model_class = GPTQ_CAUSAL_LM_MODEL_MAP[model_type] - quantized_model = model_class(pytorch_model, False, quantize_config) - else: - quantized_model = BaseGPTQForCausalLM(pytorch_model, False, quantize_config) - if not (config["layers_block_name"] and config["outside_layer_modules"] and config["inside_layer_modules"]): - raise ValueError( - "Can't get layers_block_name to quantize automatically, " - "please set layers_block_name, outside_layer_modules and inside_layer_modules in config." 
-            )
-        quantized_model.layers_block_name = config["layers_block_name"]
+        model_type = pytorch_model.config.model_type if hasattr(pytorch_model, "config") else ""
+        model_class = GPTQ_CAUSAL_LM_MODEL_MAP.get(model_type, BaseGPTQForCausalLM)
+        quantized_model = model_class(pytorch_model, False, quantize_config)
+
+        if config["outside_layer_modules"]:
             quantized_model.outside_layer_modules = config["outside_layer_modules"]
+        elif model_type in MODEL_OUTSIDE_LAYER_MODULES:
+            quantized_model.outside_layer_modules = MODEL_OUTSIDE_LAYER_MODULES[model_type]
+        else:
+            raise ValueError("Can't get outside_layer_modules to quantize automatically, please provide it in config.")
+
+        if config["inside_layer_modules"]:
             quantized_model.inside_layer_modules = config["inside_layer_modules"]
+        elif model_type in MODEL_INSIDE_LAYER_MODULES:
+            quantized_model.inside_layer_modules = MODEL_INSIDE_LAYER_MODULES[model_type]
+        else:
+            raise ValueError("Can't get inside_layer_modules to quantize automatically, please provide it in config.")
+
+        if config["layers_block_name"]:
+            quantized_model.layers_block_name = config["layers_block_name"]
 
         import auto_gptq
diff --git a/test/unit_test/cli/test_cli.py b/test/unit_test/cli/test_cli.py
index 393ca23d5..b58b1a21e 100644
--- a/test/unit_test/cli/test_cli.py
+++ b/test/unit_test/cli/test_cli.py
@@ -245,5 +245,47 @@ def test_cloud_cache_command(mock_container_client, test_set):
     mock_container_client().delete_blob.assert_called_once()
 
 
+@pytest.mark.parametrize("algorithm_names", [{"awq"}, {"awq", "gptq"}])
+@patch("olive.workflows.run")
+@patch("olive.cli.quantize.tempfile.TemporaryDirectory")
+def test_quantize_command(mock_tempdir, mock_run, algorithm_names, tmp_path):
+    # some directories
+    tmpdir = tmp_path / "tmpdir"
+    tmpdir.mkdir()
+
+    output_dir = tmp_path / "output_dir"
+
+    # setup mocks
+    mock_tempdir.return_value = tmpdir.resolve()
+    mock_run.return_value = {"output_dir": Footprint(nodes={"dummy_output": "dummy_output"})}
+
+    for algo_name in algorithm_names:
+        workflow_output_dir = tmpdir / algo_name / "cpu-cpu_model" / "model"
+        workflow_output_dir.mkdir(parents=True)
+        dummy_model = workflow_output_dir / "dummy_model"
+        with dummy_model.open("w") as f:
+            f.write("dummy_model")
+
+    # command args
+    command_args = [
+        "quantize",
+        "-m",
+        "dummy_model",
+        "-d",
+        "dummy_dataset",
+        "--algorithms",
+        *algorithm_names,
+        "-o",
+        str(output_dir),
+    ]
+
+    # execute
+    cli_main(command_args)
+
+    config = mock_run.call_args[0][0]
+    assert config["input_model"]["model_path"] == "dummy_model"
+    assert {el.name for el in output_dir.iterdir()} == algorithm_names
+
+
 # TODO(anyone): Add tests for ManageAMLComputeCommand
 # Test for ExportAdaptersCommand is added as part of test/unit_test/passes/onnx/test_export_adapters.py
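
To illustrate how QuantizeCommand._get_run_config assembles its workflow config, here is a minimal standalone sketch. The local set_nested_dict_value is a simplified stand-in whose behavior is inferred from how the real olive.common.utils helper is called in this patch, and CONFIG_TEMPLATE/build_quantize_config are illustrative names; the actual template in quantize.py carries more entries (systems, data_configs, cache_dir, host/target).

# Minimal sketch (not Olive code) of how QuantizeCommand._get_run_config builds a workflow config.
# set_nested_dict_value below is a simplified stand-in for olive.common.utils.set_nested_dict_value,
# with behavior inferred from how this patch calls it; the real helper may differ.
from copy import deepcopy
from typing import Any, Dict, Sequence, Union

CONFIG_TEMPLATE = {
    "input_model": {"type": "HfModel", "load_kwargs": {"attn_implementation": "eager"}},
    "passes": {"awq": {"type": "AutoAWQQuantizer"}, "gptq": {"type": "GptqQuantizer"}},
    "pass_flows": [],
    "output_dir": "models",
}


def set_nested_dict_value(d: Dict[str, Any], keys: Union[str, Sequence[Any]], value: Any) -> None:
    """Walk `keys` into nested dicts and set the final entry to `value`."""
    if isinstance(keys, str):
        keys = (keys,)
    for key in keys[:-1]:
        d = d[key]
    d[keys[-1]] = value


def build_quantize_config(model_name_or_path: str, algorithms: Sequence[str], output_dir: str) -> Dict[str, Any]:
    config = deepcopy(CONFIG_TEMPLATE)
    to_replace = [
        (("input_model", "model_path"), model_name_or_path),
        ("pass_flows", [[name] for name in algorithms]),  # one single-pass flow per selected algorithm
        ("output_dir", output_dir),
    ]
    for keys, value in to_replace:
        if value is not None:
            set_nested_dict_value(config, keys, value)
    return config


if __name__ == "__main__":
    print(build_quantize_config("some-org/some-model", ["awq", "gptq"], "/tmp/quantized"))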
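
For reference, the model-path resolution performed by BaseOliveCLICommand._get_model_name_or_path can be exercised on its own. The sketch below mirrors the two regexes from the patch (named groups restored) in a standalone function; resolve_model_name_or_path is an illustrative name, not part of Olive.

# Standalone sketch of the model-path resolution used by BaseOliveCLICommand._get_model_name_or_path.
import re
from typing import Dict, Union


def resolve_model_name_or_path(model_name_or_path: str) -> Union[str, Dict[str, str]]:
    # "registry_name:model_name:version" -> AzureML registry model config
    match = re.match(r"^(?P<registry_name>[^:]+):(?P<model_name>[^:]+):(?P<version>[^:]+)$", model_name_or_path)
    if match:
        return {
            "type": "azureml_registry_model",
            "registry_name": match.group("registry_name"),
            "name": match.group("model_name"),
            "version": match.group("version"),
        }

    # Hugging Face URL -> "org/repo" id
    match = re.search(r"https://huggingface\.co/([^/]+/[^/]+)(?:/.*)?", model_name_or_path)
    if match:
        return match.group(1)

    # Anything else (local path, plain HF id) is passed through unchanged.
    return model_name_or_path


print(resolve_model_name_or_path("my-registry:my-model:3"))
print(resolve_model_name_or_path("https://huggingface.co/org/repo/tree/main"))
print(resolve_model_name_or_path("org/repo"))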
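
The GptqQuantizer change resolves outside/inside layer modules in this order: explicit config value, then the per-architecture defaults added to olive/common/hf/mappings.py, then an error. Below is a small sketch of that resolution order on plain dicts, assuming the same mapping shapes as in the patch; resolve_layer_modules is an illustrative helper, not the pass itself.

# Sketch of the fallback order the updated GptqQuantizer uses to pick layer modules:
# explicit config value -> per-architecture default from olive/common/hf/mappings.py -> error.
from typing import Dict, Optional

MODEL_OUTSIDE_LAYER_MODULES = {
    "phi3": ["model.embed_tokens", "embed_dropout", "model.norm"],
}

MODEL_INSIDE_LAYER_MODULES = {
    "phi3": [
        ["self_attn.qkv_proj"],
        ["self_attn.o_proj"],
        ["mlp.gate_up_proj"],
        ["mlp.down_proj"],
    ]
}


def resolve_layer_modules(config: Dict[str, Optional[list]], model_type: str) -> Dict[str, list]:
    resolved = {}
    for key, defaults in (
        ("outside_layer_modules", MODEL_OUTSIDE_LAYER_MODULES),
        ("inside_layer_modules", MODEL_INSIDE_LAYER_MODULES),
    ):
        if config.get(key):
            resolved[key] = config[key]
        elif model_type in defaults:
            resolved[key] = defaults[model_type]
        else:
            raise ValueError(f"Can't get {key} to quantize automatically, please provide it in config.")
    return resolved


print(resolve_layer_modules({"outside_layer_modules": None, "inside_layer_modules": None}, "phi3"))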